Skip to content

Commit

Permalink
fix reading big endian
Browse files Browse the repository at this point in the history
  • Loading branch information
JanMarvin committed Jun 16, 2018
1 parent e1d7403 commit 79e6ce4
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 45 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: readspss
Type: Package
Title: Importing SPSS Files
Version: 0.3.1
Version: 0.3.2
Authors@R: c(
person("Jan Marvin", "Garbuszus",
email = "[email protected]", role = c("aut", "cre")),
Expand Down
113 changes: 69 additions & 44 deletions src/readsav.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,9 +92,12 @@ List sav(const char * filePath, const bool debug, std::string encStr,
// file format? should be 2 or 3
arch = readbin(arch, sav, swapit);

if ((arch <2) & (arch > 3))
if ((arch <2) | (arch > 3))
swapit = true;

if (debug)
Rprintf("swapit: %d\n", swapit);

k = readbin(k, sav, swapit);

if (debug)
Expand Down Expand Up @@ -166,6 +169,8 @@ List sav(const char * filePath, const bool debug, std::string encStr,
rtype = readbin(rtype, sav, swapit);


if (debug)
Rprintf("rtype: %d\n", rtype);

while( rtype < 999 )
{
Expand All @@ -185,12 +190,13 @@ List sav(const char * filePath, const bool debug, std::string encStr,
int32_t typeINT = 0, has_var_label = 0, n_missing_values = 0,
printINT = 0, writeINT = 0, lablen32 = 0;

while (rtype == 2) {
while (rtype == 2)
{

// skip 20 bytes or read 5 unks
vtype = readbin(vtype, sav, swapit); // Variable type
vtype = readbin(vtype, sav, swapit); // Variable type
vlflag = readbin(vlflag, sav, swapit); // Label flag
nmiss = readbin(nmiss, sav, swapit);
nmiss = readbin(nmiss, sav, swapit);

// bits of int32_t define digits, width and type
var4 = readbin(var4, sav, swapit);
Expand Down Expand Up @@ -246,6 +252,16 @@ List sav(const char * filePath, const bool debug, std::string encStr,

varnames.push_back(nvarname);

if (debug) {
Rcout << nvarname << std::endl;
Rprintf("vflag %d\n", vlflag);
Rprintf("nmiss %d\n", nmiss);
Rprintf("var41 %d\n", var41);
Rprintf("var42 %d\n", var42);
Rprintf("var43 %d\n", var43);
Rprintf("var44 %d\n", var44);
}

int32_t origlen = 0;
if(vlflag==1)
{
Expand All @@ -258,11 +274,10 @@ List sav(const char * filePath, const bool debug, std::string encStr,
for(int i=1; i<4; ++i)
{
if ((origlen+i)%4==0)
origlen = origlen+i;
origlen = origlen + i;
}
}

// Rprintf("%d \n", rtype);
std::string vallabel (origlen, '\0');
vallabel = readstring(vallabel, sav, vallabel.size());

Expand All @@ -272,60 +287,64 @@ List sav(const char * filePath, const bool debug, std::string encStr,
std::regex("^ +| +$"), "$1");

vallabels.push_back(vallabel);
}

// -----------------------------------------
// missings
//
// -----------------------------------------
// missings
//

int8_t const nmisstype = std::abs(nmiss);
int8_t const nmisstype = std::abs(nmiss);

// SPSS knows 5 different missing types. -3, -2 are range types. 1, 2,
// 3 are discrete types. Range types have min range and max range. -3
// has an additional discrete value.
// SPSS knows 5 different missing types. -3, -2 are range types. 1, 2,
// 3 are discrete types. Range types have min range and max range. -3
// has an additional discrete value.


if (nmisstype > 0) {
if (debug)
Rprintf("vflag %d\n", vlflag);

// missing values
// Vector needs to be of size n+1, because the first value will be
// nmisstype followed by nmisstype values
Rcpp::NumericVector missing(nmisstype+1);
Rcpp::CharacterVector missingV(nmisstype+1);
// PSPP states that long strings are handled differently
if (nmisstype > 0) {

double miss0 = 0;
bool noNum = false;
// missing values
// Vector needs to be of size n+1, because the first value will be
// nmisstype followed by nmisstype values
Rcpp::NumericVector missing(nmisstype+1);
Rcpp::CharacterVector missingV(nmisstype+1);

if (vtype != 0)
noNum = true;
double miss0 = 0;
bool noNum = false;

for (int32_t i = 0; i < nmisstype; ++i) {
if (vtype != 0)
noNum = true;

if (noNum) {
std::string mV (8, '\0');
mV = readstring(mV, sav, mV.size());
for (int32_t i = 0; i < nmisstype; ++i) {

if (noNum) {
std::string mV (8, '\0');
mV = readstring(mV, sav, mV.size());

mV = std::regex_replace(mV, std::regex("^ +| +$"), "$1");

missingV(0) = nmiss;
missingV(i+1) = mV;
mV = std::regex_replace(mV, std::regex("^ +| +$"), "$1");

} else {
missingV(0) = nmiss;
missingV(i+1) = mV;

miss0 = readbin(miss0, sav, swapit);
} else {

missing(0) = nmiss;
missing(i+1) = miss0;
}
miss0 = readbin(miss0, sav, swapit);

missing(0) = nmiss;
missing(i+1) = miss0;
}

if (noNum)
missings.push_back(missingV);
else
missings.push_back(missing);

}

if (noNum)
missings.push_back(missingV);
else
missings.push_back(missing);

}

break;
Expand Down Expand Up @@ -366,6 +385,8 @@ List sav(const char * filePath, const bool debug, std::string encStr,

} else {
memcpy(&coden , cV.c_str(), sizeof(double));
if (swapit) coden = swap_endian(coden);

code(i) = coden;
}

Expand Down Expand Up @@ -450,7 +471,7 @@ List sav(const char * filePath, const bool debug, std::string encStr,

// trim additional whitespaces to the right
docline = std::regex_replace(docline,
std::regex(" +$"), "$1");
std::regex(" +$"), "$1");

Document(i) = docline;

Expand All @@ -464,7 +485,8 @@ List sav(const char * filePath, const bool debug, std::string encStr,


// additional information
while (rtype==7) {
while (rtype==7)
{
Rcpp::checkUserInterrupt();

// subtype integer: 3 / floating: 4 / varsyst: 11
Expand Down Expand Up @@ -607,7 +629,10 @@ List sav(const char * filePath, const bool debug, std::string encStr,

Rcout << data << std::endl;

Rcout << "unknown subtype " << subtyp << " detected" << std::endl;
Rcout << "unknown subtype " << subtyp << " detected." << std::endl;
Rcout << "most likely no readson to worry. but if you want\n" <<
"to help me out and can share a row of this datafile, \n" <<
"please mail me!" << std::endl;

break;
}
Expand Down Expand Up @@ -732,7 +757,7 @@ List sav(const char * filePath, const bool debug, std::string encStr,
// only uint8_t it stores 8 vals. If data contains doubles it stores a
// 253 and the next 8 byte will be the double.

chunk = readbin(val_d, sav, swapit);
chunk = readbin(val_d, sav, 0);

IntegerVector chunkvec(8);

Expand Down

1 comment on commit 79e6ce4

@JanMarvin
Copy link
Owner Author

@JanMarvin JanMarvin commented on 79e6ce4 Jun 16, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The ints var4 and var5 should not be byte-swapped either: the first bit has to remain the first bit, rather than ending up as the last bit after swapping. This is of minor importance, since these values are only used to identify dates.

[Edit:] Everything is fine.

Please sign in to comment.