Merge pull request #48 from sjewo/testing

Merge Testing for 0.9.0 release
sjewo · May 5, 2017 · 36e5803 · 36e5803
2 parents fac6df6 + 96339eb
commit 36e5803
Show file tree

Hide file tree

Showing 20 changed files with 496 additions and 117 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,14 +1,16 @@
 Package: readstata13
 Type: Package
 Title: Import 'Stata' Data Files
-Version: 0.8.5
+Version: 0.9.0
 Authors@R: c(
     person("Jan Marvin", "Garbuszus",
     email = "[email protected]", role = c("aut")),
     person("Sebastian", "Jeworutzki",
     email="[email protected]", role = c("aut", "cre")),
     person("R Core Team", role="cph"),
-    person("Magnus Thor", "Torfason", role="ctb")
+    person("Magnus Thor", "Torfason", role="ctb"),
+    person("Luke M.", "Olson", role="ctb"),
+    person("Giovanni", "Righi", role="ctb")
     )
 Description: Function to read and write the 'Stata' file format.
 URL: https://github.com/sjewo/readstata13

diff --git a/NEWS b/NEWS
@@ -1,3 +1,11 @@
+[0.9.0]
+- generate unique factor labels to prevent errors in factor definition
+- check interrupt for long read
+- fix storage size of character vectors in save.dta13
+- fix saving characters containing missings
+- implement partial reading of dta-files
+- fix an integer bug with saving data.frames of length requiring uint64_t
+
 0.8.5
 - fix errors on big-endian systems
 
@@ -11,7 +19,7 @@
 - Stop compression of doubles as floats. Now test if compression of doubles as
   interger types is possible.
 - add many function tests
-  
+
 
 0.8.2
 - save NA values in character vector as empty string

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -5,8 +5,8 @@ stata_pre13_save <- function(filePath, dat) {
     .Call('readstata13_stata_pre13_save', PACKAGE = 'readstata13', filePath, dat)
 }
 
-stata_read <- function(filePath, missing) {
-    .Call('readstata13_stata_read', PACKAGE = 'readstata13', filePath, missing)
+stata_read <- function(filePath, missing, selectrows) {
+    .Call('readstata13_stata_read', PACKAGE = 'readstata13', filePath, missing, selectrows)
 }
 
 stata_save <- function(filePath, dat) {

diff --git a/R/read.R b/R/read.R
@@ -23,11 +23,12 @@
 #' @param convert.factors \emph{logical.} If \code{TRUE}, factors from Stata
 #'  value labels are created.
 #' @param generate.factors \emph{logical.} If \code{TRUE} and convert.factors is
-#'  TRUE, missing factor labels are created from integers.
+#'  TRUE, missing factor labels are created from integers. If duplicated labels are found, 
+#'  unique labels will be generated according the following scheme: "label_(integer code)".
 #' @param encoding \emph{character.} Strings can be converted from Windows-1252 or UTF-8
 #'  to system encoding. Options are "latin1" or "UTF-8" to specify target
 #'  encoding explicitly. Stata 14 files are UTF-8 encoded and may contain strings
-#'   which can't be displayed in the current locale. 
+#'   which can't be displayed in the current locale.
 #'   Set encoding=NULL to stop reencoding.
 #' @param fromEncoding \emph{character.} We expect strings to be encoded as
 #'  "CP1252" for Stata Versions 13 and older. For dta files saved with Stata 14
@@ -45,8 +46,11 @@
 #'  converted.
 #' @param add.rownames \emph{logical.} If \code{TRUE}, the first column will be
 #'  used as rownames. Variable will be dropped afterwards.
-#' @param nonint.factors \emph{logical.} If \code{TRUE}, factors labels 
+#' @param nonint.factors \emph{logical.} If \code{TRUE}, factors labels
 #'  will be assigned to variables of type float and double.
+#' @param select.rows \emph{integer.} Vector of one or two numbers. If single
+#'  value rows from 1:val are selected. If two values of a range are selected
+#'  the rows in range will be selected.
 #'
 #' @details If the filename is a url, the file will be downloaded as a temporary
 #'  file and read afterwards.
@@ -66,17 +70,17 @@
 #'
 #' Stata 13 introduced a new character type called strL. strLs are able to store
 #'  strings up to 2 billion characters.  While R is able to store
-#'  strings of this size in a character vector, the printed representation of such 
-#'  vectors looks rather cluttered, so it's possible to save only a reference in the 
-#'  data.frame with option \code{replace.strl=FALSE}. 
+#'  strings of this size in a character vector, the printed representation of such
+#'  vectors looks rather cluttered, so it's possible to save only a reference in the
+#'  data.frame with option \code{replace.strl=FALSE}.
 #'
 #' In R, you may use rownames to store characters (see for instance
 #'  \code{data(swiss)}). In Stata, this is not possible and rownames have to be
-#'  stored as a variable. If you want to use rownames, set add.rownames to TRUE. 
-#'  Then the first variable of the dta-file will hold the rownames of the resulting 
+#'  stored as a variable. If you want to use rownames, set add.rownames to TRUE.
+#'  Then the first variable of the dta-file will hold the rownames of the resulting
 #'  data.frame.
 #'
-#' Reading dta-files of older and newer versions than 13 was introduced 
+#' Reading dta-files of older and newer versions than 13 was introduced
 #'  with version 0.8.
 #' @return The function returns a data.frame with attributes. The attributes
 #'  include
@@ -107,14 +111,15 @@
 #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de}
 #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de}
 #' @useDynLib readstata13
-#' @importFrom utils download.file 
-#' @importFrom stats na.omit 
+#' @importFrom utils download.file
+#' @importFrom stats na.omit
 #' @export
 read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
                        encoding = "UTF-8", fromEncoding=NULL,
                        convert.underscore = FALSE, missing.type = FALSE,
                        convert.dates = TRUE, replace.strl = TRUE,
-                       add.rownames = FALSE, nonint.factors=FALSE) {
+                       add.rownames = FALSE, nonint.factors=FALSE,
+                       select.rows = NULL) {
   # Check if path is a url
   if (length(grep("^(http|ftp|https)://", file))) {
     tmp <- tempfile()
@@ -128,7 +133,39 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
   if (!file.exists(filepath))
     return(message("File not found."))
 
-  data <- stata_read(filepath,missing.type)
+
+
+  # some select.row checks
+  if (!is.null(select.rows)) {
+    # check that it is a numeric
+    if (!is.numeric(select.rows)){
+      return(message("select.rows must be of type numeric"))
+    } else {
+      # guard against negative values
+      if (any(select.rows < 0) )
+        select.rows <- abs(select.rows)
+
+      # check that lenght is not > 2
+      if (length(select.rows) > 2)
+        return(message("select.rows must be of length 1 or 2."))
+
+      # if lenght 1 start at row 1
+      if (length(select.rows) == 1)
+        select.rows <- c(1, select.rows)
+    }
+    # reorder if 2 is bigger than 1
+    if (select.rows[2] < select.rows[1])
+      select.rows <- c(select.rows[2], select.rows[1])
+
+    # make sure to start at index position 1 if select.rows[2] > 0
+    if (select.rows[2] > 0 & select.rows[1] == 0)
+      select.rows[1] <- 1
+  } else {
+    # set a value
+    select.rows <- c(0,0)
+  }
+
+  data <- stata_read(filepath, missing.type, select.rows)
 
   version <- attr(data, "version")
 
@@ -201,7 +238,7 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
 
   ## Encoding
   if(!is.null(encoding)) {
-    
+
     # set from encoding by dta version
     if(is.null(fromEncoding)) {
       fromEncoding <- "CP1252"
@@ -312,30 +349,41 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,
       if (labname %in% names(label)) {
         if((vartype == sdouble | vartype == sfloat)) {
           if(!nonint.factors) {
-            warning(paste0("\n  ",vnames[i], ":\n  Factor codes of type double or float detected - no labels assigned.\n  Set option nonint.factors to TRUE to assign labels anyway."))
+            warning(paste0("\n  ",vnames[i], ":\n  Factor codes of type double or float detected - no labels assigned.\n  Set option nonint.factors to TRUE to assign labels anyway.\n"))
             next
           }
         }
         # get unique values / omit NA
         varunique <- na.omit(unique(data[, i]))
         # assign label if label set is complete
         if (all(varunique %in% labtable)) {
+
+          #check for duplicated labels
+          labcount <- table(names(labtable))
+          if(any(labcount > 1)) {
+            warning(paste0("\n  ",vnames[i], ":\n  Duplicated factor levels detected - generating unique labels.\n"))
+            labdups <- names(labtable) %in% names(labcount[labcount > 1])
+            # generate unique labels from assigned label and code number
+            names(labtable)[labdups] <- paste0(names(labtable)[labdups], "_(", labtable[labdups], ")")
+          }
+
           data[, i] <- factor(data[, i], levels=labtable,
                               labels=names(labtable))
           # else generate labels from codes
         } else if (generate.factors) {
           names(varunique) <- as.character(varunique)
           gen.lab  <- sort(c(varunique[!varunique %in% labtable], labtable))
-
+          
           data[, i] <- factor(data[, i], levels=gen.lab,
                               labels=names(gen.lab))
+
         } else {
           warning(paste0("\n  ",vnames[i], ":\n  Missing factor labels - no labels assigned.\n  Set option generate.factors=T to generate labels."))
         }
       }
     }
   }
-  
+
   if (add.rownames) {
     rownames(data) <- data[[1]]
     data[[1]] <- NULL

diff --git a/R/save.R b/R/save.R
@@ -31,8 +31,8 @@
 #'  hexcode.
 #' @param convert.dates \emph{logical.} If \code{TRUE}, dates will be converted
 #'  to Stata date time format. Code from \code{foreign::write.dta}
-#' @param convert.underscore \emph{logical.} If \code{TRUE}, in variable names
-#'  dots will be converted to underscores.
+#' @param convert.underscore \emph{logical.} If \code{TRUE}, all non numerics or
+#' non alphabet characters will be converted to underscores.
 #' @param tz \emph{character.} The name of the timezone convert.dates will use.
 #' @param add.rownames \emph{logical.} If \code{TRUE}, a new variable rownames
 #'  will be added to the dta-file.
@@ -146,7 +146,9 @@ save.dta13 <- function(data, file, data.label=NULL, time.stamp=TRUE,
   rownames(data) <- NULL
 
   if (convert.underscore) {
-    names(data) <- gsub("[^a-zA-Z0-9:]", "_", names(data))
+    names(data) <- gsub("[^a-zA-Z0-9_]", "_", names(data))
+    names(data)[grepl("^[0-9]", names(data))] <-
+      paste0( "_", names(data)[grepl("^[0-9]", names(data))])
   }
 
   filepath <- path.expand(file)
@@ -224,7 +226,7 @@ save.dta13 <- function(data, file, data.label=NULL, time.stamp=TRUE,
   ff <- sapply(data, is.numeric)
   ii <- sapply(data, is.integer)
   factors <- sapply(data, is.factor)
-  empty <- sapply(data, function(x) all(is.na(x)))
+  empty <- sapply(data, function(x) all(is.na(x) & !is.character(x)))
   ddates <- vartypen == "Date"
 
   # default no compression: numeric as double; integer as long; date as date;
@@ -284,11 +286,7 @@ save.dta13 <- function(data, file, data.label=NULL, time.stamp=TRUE,
     }
   }
 
-
   # str and strL are stored by maximum length of chars in a variable
-  maxchar <- function(x) {
-    max(nchar(x, type="byte")) + 1
-  }
   str.length <- sapply(data[vartypen == "character"], FUN=maxchar)
   str.length[str.length > sstr] <- sstrl
 

diff --git a/R/tools.R b/R/tools.R
@@ -185,7 +185,8 @@ get.label <- function(dat, label.name) {
 
 #' Assign Stata Labels to a Variable
 #'
-#' Assign value labels from a Stata label set to a variable.
+#' Assign value labels from a Stata label set to a variable. If duplicated labels are found, 
+#' unique labels will be generated according the following scheme: "label_(integer code)".
 #'
 #' @param dat \emph{data.frame.} Data.frame created by \code{read.dta13}.
 #' @param var.name \emph{character.} Name of the variable in the data.frame
@@ -210,6 +211,17 @@ set.label <- function(dat, var.name, lang=NA) {
 
   labtable <- get.label(dat, get.label.name(dat, var.name, lang))
 
+  #check for duplicated labels
+  labcount <- table(names(labtable))
+  if(any(labcount > 1)) {
+
+
+    warning(paste0("\n  ",var.name, ":\n  Duplicated factor levels detected - generating unique labels.\n"))
+    labdups <- names(labtable) %in% names(labcount[labcount > 1])
+    # generate unique labels from assigned label and code number
+    names(labtable)[labdups] <- paste0(names(labtable)[labdups], "_(", labtable[labdups], ")")
+  }
+
   return(factor(tmp, levels=labtable,
                 labels=names(labtable))
   )
@@ -228,7 +240,7 @@ set.label <- function(dat, var.name, lang=NA) {
 #' @return Returns an named vector of variable labels
 #' @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de}
 #' @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de}
-#' @aliases varlabel 
+#' @aliases varlabel
 #' @aliases 'varlabel<-'
 NULL
 
@@ -390,3 +402,28 @@ set.lang <- function(dat, lang=NA, generate.factors=FALSE) {
 saveToExport <- function(x) {
   isTRUE(all.equal(x, as.integer(x)))
 }
+
+
+#' Check max char length of data.frame vectors
+#'
+#' Stata requires us to provide the maximum size of a charactervector as every
+#' row is stored in a bit region of this size.
+#'
+#' Ex: If the max chars size is four, _ is no character in this vector:
+#' 1. row: four
+#' 3. row: one_
+#' 4. row: ____
+#'
+#' If a character vector contains only missings or is empty, we will assign it a
+#' value of one, since Stata otherwise cannot handle what we write.
+#'
+#' @param x vector of data frame
+maxchar <- function(x) {
+  z <- max(nchar(x, type="byte"), na.rm = TRUE)
+
+  # Stata does not allow storing a string of size 0
+  if(is.infinite(z) | (z == 0))
+    z <- 1
+
+  z
+}
diff --git a/README.md b/README.md
@@ -35,12 +35,12 @@ users need to install
 
 ```R
 # install.packages("devtools")
-devtools::install_github("sjewo/readstata13", ref="0.8.5")
+devtools::install_github("sjewo/readstata13", ref="0.9.0")
 ```
 
 Older Versions of devtools require a username option:
 ```R
-install_github("readstata13", username="sjewo", ref="0.8.5")
+install_github("readstata13", username="sjewo", ref="0.9.0")
 ```
 
 To install the current development version from github:
@@ -56,6 +56,13 @@ devtools::install_github("sjewo/readstata13", ref="testing")
 [![CRAN Downloads](http://cranlogs.r-pkg.org/badges/readstata13)](https://cran.r-project.org/package=readstata13)
 
 ### Working features
+* [0.9.0] Generate unique factor labels to prevent errors in factor definition
+* [0.9.0] check interrupt for long read. Patch by Giovanni Righi
+* [0.9.0] updates to notes, roxygen and register
+* [0.9.0] fixed size of character length. Bug reported by Yiming (Paul) Li
+* [0.9.0] fix saving characters containing missings. Bug reported by Eivind H. Olsen
+* [0.9.0] adjustments to convert.underscore. Patch by luke-m-olson
+* [0.9.0] alow partial reading of selected rows
 * [0.8.5] fix errors on big-endians systems
 * [0.8.4] fix valgrind errors. converting from dta.write to writestr
 * [0.8.4] fix for empty data label

diff --git a/inst/include/read_dta.h b/inst/include/read_dta.h
@@ -18,6 +18,6 @@
 #ifndef READDTA_H
 #define READDTA_H
 
-Rcpp::List read_dta(FILE * file, const bool missing);
+Rcpp::List read_dta(FILE * file, const bool missing, const Rcpp::IntegerVector selectrows);
 
 #endif
diff --git a/inst/include/read_pre13_dta.h b/inst/include/read_pre13_dta.h
@@ -18,6 +18,6 @@
 #ifndef READPRE13DTA_H
 #define READPRE13DTA_H
 
-Rcpp::List read_pre13_dta(FILE * file, const bool missing);
+Rcpp::List read_pre13_dta(FILE * file, const bool missing, const Rcpp::IntegerVector selectrows);
 
 #endif
diff --git a/man/maxchar.Rd b/man/maxchar.Rd