From c076735dd42313e1d31bd8ab12b353bf1d41b380 Mon Sep 17 00:00:00 2001 From: Jaakko Ruohio Date: Sat, 27 Jan 2024 23:49:20 +0200 Subject: [PATCH] Performance improvements, do not materialize StringVectors --- Project.toml | 2 +- src/JMPReader.jl | 10 ++++++++-- src/column.jl | 21 ++++++--------------- src/utils.jl | 42 ++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 55 insertions(+), 20 deletions(-) diff --git a/Project.toml b/Project.toml index b2fcfc0..9525f86 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "JMPReader" uuid = "d9f7e686-cf87-4d12-8d7a-0e9b8c9fba29" authors = ["Jaakko Ruohio "] -version = "0.1.6-DEV" +version = "0.1.6" [deps] CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193" diff --git a/src/JMPReader.jl b/src/JMPReader.jl index 39ae74d..13a0906 100644 --- a/src/JMPReader.jl +++ b/src/JMPReader.jl @@ -3,10 +3,12 @@ module JMPReader export readjmp using Dates: unix2datetime, Date, DateTime -using DataFrames: DataFrame +using DataFrames: DataFrame, select! 
using CodecZlib: transcode, GzipDecompressor using LibDeflate: gzip_decompress!, Decompressor, LibDeflateErrors, LibDeflateErrors.deflate_insufficient_space using WeakRefStrings: StringVector +using Base.Threads: nthreads, @threads, @spawn +using Base.Iterators: partition include("types.jl") include("constants.jl") @@ -24,9 +26,13 @@ function readjmp(fn::AbstractString) a = read(fn) check_magic(a, fn) info = metadata(a) + deflatebuffer = Vector{UInt8}() alldata = [column_data(a, info, i, deflatebuffer) for i in 1:info.ncols] - return DataFrame(alldata, info.column.names) + names = info.column.names + df = DataFrame(alldata, names) + + return df end end # module JMPReader diff --git a/src/column.jl b/src/column.jl index 6119360..29dab53 100644 --- a/src/column.jl +++ b/src/column.jl @@ -96,11 +96,7 @@ function column_data(data, info, i::Int, deflatebuffer::Vector{UInt8}) (0x01 ≤ dt3 ≤ 0x07 && dt4 == 0x00) width = dt5 io = a[end-info.nrows*width+1:end] - str = StringVector{String}(io, info.nrows) - str.lengths .= width - str.offsets .= [0; cumsum(str.lengths)[begin:end-1]] - str = rstrip.(str, '\0') - str = String.(str) # SubString->String + str = to_str(io, info.nrows, width) return str end @@ -120,14 +116,12 @@ function column_data(data, info, i::Int, deflatebuffer::Vector{UInt8}) else # uncompressed # continue after dt1,...,dt5 were read _read_reals!(raw, offset, UInt8, 5) - hasunits = _read_real!(raw, offset, UInt8) + hasprops = _read_real!(raw, offset, UInt8) _read_reals!(raw, offset, UInt8) n1 = _read_real!(raw, offset, Int64) - if hasunits == 1 && n1 > 0 - _read_real!(raw, offset, Int16) # ?? 
- _read_real!(raw, offset, Int64) # some length - label = _read_string!(raw, offset, 4) - _read_real!(raw, offset, UInt32) + if hasprops == 1 + # some block that ends in [0xff, 0xff, 0xff, 0xff] + offset[1] = findnext([0xff, 0xff, 0xff, 0xff], raw, offset[1])[end] end _read_real!(raw, offset, UInt16) # n2 as bytes n2 = _read_real!(raw, offset, UInt32) @@ -146,10 +140,7 @@ function column_data(data, info, i::Int, deflatebuffer::Vector{UInt8}) end io = raw[end-sum(widths)+1:end] end - str = StringVector{String}(io, info.nrows) - str.lengths .= widths - str.offsets .= [0; cumsum(UInt64.(widths))[begin:end-1]] - str = String.(str) # materialize + str = to_str(io, info.nrows, widths) return str end end diff --git a/src/utils.jl b/src/utils.jl index b231f60..dacc29c 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -30,7 +30,45 @@ end function check_magic(a, fn) len = length(a) - len ≥ length(MAGIC_JMP) && a[1:length(MAGIC_JMP)] == MAGIC_JMP || throw(ArgumentError("\"$fn\" is not a .jmp file")) - len < 507 && throw(ArgumentError("\"$fn\" truncated?")) + len ≥ length(MAGIC_JMP) && a[1:length(MAGIC_JMP)] == MAGIC_JMP || throw(ArgumentError("Data table appears to have been corrupted, or is not a .jmp file. `$fn` ")) + len < 507 && throw(ArgumentError("Data table appears to have been corrupted. `$fn`")) + nothing +end + +function to_str(buffer, n, lengths::AbstractVector) + str = StringVector{String}(buffer, n) + str.lengths .= lengths + offset = UInt64(0) + @inbounds for i in 1:n + str.offsets[i] = offset + offset += lengths[i] + end + str +end + +function to_str(buffer, n, length::Integer) + str = StringVector{String}(buffer, n) + str.lengths .= length + offset = UInt64(0) + @inbounds for i in 1:n + str.offsets[i] = offset + offset += length + end + rstripnull!(str) + str +end + +""" + rstripnull!(strs::StringVector) + +Remove trailing nulls from `strs`. 
+"""
+function rstripnull!(s::StringVector)
+    @inbounds for (i, (length, offset)) in enumerate(zip(s.lengths, s.offsets))  # offsets are 0-based: offset + length is the entry's last byte
+        while length > 0 && s.buffer[offset + length] == 0x00  # check length first so an empty/all-NUL entry never reads buffer[offset] (index 0 for row 1)
+            length -= 1
+        end
+        s.lengths[i] = length  # shrink the entry in place; the byte buffer itself is left untouched
+    end
     nothing
 end
\ No newline at end of file