Skip to content

Commit

Permalink
add support for dta formats 120 and 121
Browse files Browse the repository at this point in the history
  • Loading branch information
JanMarvin committed Aug 4, 2024
1 parent 5320f60 commit 3cbfaab
Show file tree
Hide file tree
Showing 10 changed files with 108 additions and 17 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@ LinkingTo: Rcpp
ByteCompile: yes
Suggests: testthat
Encoding: UTF-8
RoxygenNote: 7.2.3
RoxygenNote: 7.3.2
10 changes: 8 additions & 2 deletions R/read.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
#
# Copyright (C) 2014-2021 Jan Marvin Garbuszus and Sebastian Jeworutzki
# Copyright (C) 2014-2024 Jan Marvin Garbuszus and Sebastian Jeworutzki
# Copyright (C) of 'convert.dates' and 'missing.types' Thomas Lumley
#
# This program is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -29,7 +29,7 @@
#' "label_(integer code)".
#' @param encoding \emph{character.} Strings can be converted from Windows-1252
#' or UTF-8 to system encoding. Options are "latin1" or "UTF-8" to specify
#' target encoding explicitly. Stata 14, 15 and 16 files are UTF-8 encoded and
#' target encoding explicitly. Since Stata 14, files are UTF-8 encoded and
#' may contain strings which can't be displayed in the current locale.
#' Set encoding=NULL to stop reencoding.
#' @param fromEncoding \emph{character.} We expect strings to be encoded as
Expand Down Expand Up @@ -93,6 +93,11 @@
#'
#' Reading dta-files of older and newer versions than 13 was introduced
#' with version 0.8.
#'
#' Stata 18 introduced alias variables. Alias variables are currently not
#' resolved when reading the file: the column is kept as a placeholder and a
#' warning is issued. The format was added to the package without access
#' to Stata 18.
#'
#' @return The function returns a data.frame with attributes. The attributes
#' include
#' \describe{
Expand Down Expand Up @@ -212,6 +217,7 @@ read.dta13 <- function(file, convert.factors = TRUE, generate.factors=FALSE,

sstr <- 2045
sstrl <- 32768
salias <- 65525
sdouble <- 65526
sfloat <- 65527
slong <- 65528
Expand Down
3 changes: 1 addition & 2 deletions R/readstata13.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,9 @@
#'
#' @name readstata13
#' @aliases readstata13-package
#' @docType package
#' @useDynLib readstata13, .registration = TRUE
#' @import Rcpp
#' @note If you catch a bug, please do not sue us, we do not have any money.
#' @seealso \code{\link[foreign]{read.dta}} and \code{memisc} for dta files from
#' Stata Versions < 13
NULL
"_PACKAGE"
Binary file added inst/extdata/myproject2.dtas
Binary file not shown.
4 changes: 3 additions & 1 deletion inst/include/readstata.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (C) 2015-2017 Jan Marvin Garbuszus and Sebastian Jeworutzki
* Copyright (C) 2015-2024 Jan Marvin Garbuszus and Sebastian Jeworutzki
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
Expand Down Expand Up @@ -155,6 +155,8 @@ inline Rcpp::IntegerVector calc_rowlength(Rcpp::IntegerVector vartype) {
case STATA_STRL:
rlen(i) = 8;
break;
case STATA_ALIAS: // 0
break;
default:
rlen(i) = type;
break;
Expand Down
3 changes: 2 additions & 1 deletion inst/include/statadefines.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (C) 2015 Jan Marvin Garbuszus and Sebastian Jeworutzki
* Copyright (C) 2015-2023 Jan Marvin Garbuszus and Sebastian Jeworutzki
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
Expand Down Expand Up @@ -53,6 +53,7 @@
#define STATA_INT 65528
#define STATA_FLOAT 65527
#define STATA_DOUBLE 65526
#define STATA_ALIAS 65525

#define STATA_STR 2045
#define STATA_SHORT_STR 244
Expand Down
6 changes: 5 additions & 1 deletion man/read.dta13.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

15 changes: 14 additions & 1 deletion src/read_data.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (C) 2014-2018 Jan Marvin Garbuszus and Sebastian Jeworutzki
* Copyright (C) 2014-2024 Jan Marvin Garbuszus and Sebastian Jeworutzki
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
Expand Down Expand Up @@ -46,6 +46,12 @@ List read_data(FILE * file,
SET_VECTOR_ELT(df, i, IntegerVector(no_init(nn)));
break;

// return correct column size and create a warning
case STATA_ALIAS:
SET_VECTOR_ELT(df, i, CharacterVector(no_init(nn)));
Rf_warning("File contains unhandled alias variable in column: %d", i + 1);
break;

default:
SET_VECTOR_ELT(df, i, CharacterVector(no_init(nn)));
break;
Expand Down Expand Up @@ -166,6 +172,7 @@ List read_data(FILE * file,
break;
}
case 118:
case 120:
{
int16_t v = 0;
int64_t o = 0, z = 0;
Expand Down Expand Up @@ -193,6 +200,7 @@ List read_data(FILE * file,
break;
}
case 119:
case 121:
{
int32_t v = 0;
int64_t o = 0, z = 0;
Expand Down Expand Up @@ -221,8 +229,13 @@ List read_data(FILE * file,
}
}
break;
}
case STATA_ALIAS:
{
break; // do nothing
}
// case < 0:
// case STATA_ALIAS
default:
{
// skip to the next valid case
Expand Down
21 changes: 13 additions & 8 deletions src/read_dta.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (C) 2014-2023 Jan Marvin Garbuszus and Sebastian Jeworutzki
* Copyright (C) 2014-2024 Jan Marvin Garbuszus and Sebastian Jeworutzki
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
Expand Down Expand Up @@ -38,7 +38,7 @@ List read_dta(FILE * file,
*/

int8_t fversion = 117L; //f = first
int8_t lversion = 119L; //l = last
int8_t lversion = 121L; //l = last

std::string version(3, '\0');
readstring(version, file, version.size());
Expand Down Expand Up @@ -74,6 +74,8 @@ List read_dta(FILE * file,
break;
case 118:
case 119:
case 120:
case 121:
nvarnameslen = 129;
nformatslen = 57;
nvalLabelslen = 129;
Expand Down Expand Up @@ -106,9 +108,9 @@ List read_dta(FILE * file,
*/

uint32_t k = 0;
if (release < 119)
if (release < 119 || release == 120)
k = readbin((uint16_t)k, file, swapit);
if (release == 119)
if (release == 119 || release == 121)
k = readbin(k, file, swapit);

//</K>
Expand All @@ -123,7 +125,7 @@ List read_dta(FILE * file,

if (release == 117)
n = readbin((uint32_t)n, file, swapit);
if ((release == 118) | (release == 119))
if ((release >= 118) && (release <= 121))
n = readbin(n, file, swapit);

//</N>
Expand All @@ -146,7 +148,7 @@ List read_dta(FILE * file,

if (release == 117)
ndlabel = readbin((int8_t)ndlabel, file, swapit);
if ((release == 118) | (release == 119))
if ((release >= 118) && (release <= 121))
ndlabel = readbin(ndlabel, file, swapit);

std::string datalabel(ndlabel, '\0');
Expand Down Expand Up @@ -224,6 +226,7 @@ List read_dta(FILE * file,
* vartypes.
* 0-2045: strf (String: Max length 2045)
* 32768: strL (long String: Max length 2 billion)
* 65525: alias
* 65526: double
* 65527: float
* 65528: long
Expand Down Expand Up @@ -274,9 +277,9 @@ List read_dta(FILE * file,
{
uint32_t nsortlist = 0;

if ((release == 117) | (release == 118))
if ((release == 117) || (release == 118) || (release == 120))
nsortlist = readbin((uint16_t)nsortlist, file, swapit);
if (release == 119)
if (release == 119 || release == 121)
nsortlist = readbin(nsortlist, file, swapit);

sortlist[i] = nsortlist;
Expand Down Expand Up @@ -530,6 +533,8 @@ List read_dta(FILE * file,
}
case 118:
case 119:
case 120:
case 121:
{
uint32_t v = 0;
uint64_t o = 0;
Expand Down
61 changes: 61 additions & 0 deletions tests/testthat/test_read.R
Original file line number Diff line number Diff line change
Expand Up @@ -248,3 +248,64 @@ test_that("various datetime conversions", {
dddates <- read.dta13(datetime, convert.dates = TRUE)
expect_true(all.equal(dd, dddates, check.attributes = FALSE))
})

test_that("reading file format 120 works", {

  # The fixture is a zip archive bundling several dta files with a text index.
  fl <- system.file("extdata", "myproject2.dtas", package = "readstata13")

  # Extract into a dedicated directory so it can be removed afterwards
  # without touching anything else in the session's temporary directory.
  tmp <- tempfile("dtas_")
  dir.create(tmp)

  fls <- unzip(fl, exdir = tmp)

  # The first extracted file is used as the index. After dropping its first
  # two lines, each line holds: data name, dta file name, dta version.
  data_fram <- strsplit(readLines(fls[1])[-(1:2)], " ")
  data_fram <- as.data.frame(do.call("rbind", data_fram))

  expect_equal(data_fram$V1, c("persons", "counties"))

  # read dtas
  dtas <- fls[tools::file_ext(fls) == "dta"]
  expect_equal(basename(dtas), paste0(data_fram$V2, ".dta"))

  # The first file contains an alias variable, which triggers a warning.
  expect_warning(
    df1 <- read.dta13(dtas[1]),
    "File contains unhandled alias variable in column: 5"
  )
  df2 <- read.dta13(dtas[2], convert.factors = FALSE)

  # The version stored on each data.frame must match the index.
  expect_equal(attr(df1, "version"), as.integer(data_fram$V3[1]))
  expect_equal(attr(df2, "version"), as.integer(data_fram$V3[2]))

  # backup order
  nams <- names(df1)

  # merge: fralias_from in attr(df1, "expansion.fields") tells what to merge.
  # A logical mask is used to drop the alias placeholder column: unlike
  # `-which(...)`, it keeps all columns if the name is ever absent.
  df <- merge(
    df1[names(df1) != "median"],
    df2,
    by = "countyid",
    all.x = TRUE
  )

  # update names
  # NOTE(review): element 16 is presumably the alias characteristic of the
  # form c(<alias name>, <characteristic>, <source name>) -- confirm against
  # the fixture if it is ever regenerated.
  as_name <- attr(df1, "expansion.fields")[[16]]
  nams2 <- names(df)
  nams2[nams2 == as_name[3]] <- as_name[1]
  names(df) <- nams2

  # restore expected column order
  df <- df[nams]

  # restore row order
  df <- df[order(df$personid), ]

  expect_equal(
    df$personid, 1:20
  )

  expect_equal(
    names(df),
    c("personid", "countyid", "income", "counties", "median", "ratio")
  )

  # remove the extracted files
  unlink(tmp, recursive = TRUE)

})

0 comments on commit 3cbfaab

Please sign in to comment.