Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix misaligned pages #111

Merged
merged 5 commits into from
Nov 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion tablite/_nimlite/nimlite.nim
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,10 @@ when isMainModule and appType != "lib":
# (path_csv, encoding) = ("/home/ratchet/Documents/dematic/callisto/tests/testing/data/Dealz Poland v1.csv", str2Enc($ENC_UTF8))
# (path_csv, encoding) = ("/home/ratchet/Documents/dematic/tablite/tests/data/floats.csv", str2Enc($ENC_UTF8))
# (path_csv, encoding) = ("/home/ratchet/Documents/dematic/tablite/tests/data/bad_empty.csv", str2Enc($ENC_UTF8))
(path_csv, encoding) = ("/home/ratchet/Documents/dematic/tablite/tests/data/book1.csv", str2Enc($ENC_UTF8))
# (path_csv, encoding) = ("/home/ratchet/Documents/dematic/tablite/tests/data/book1.csv", str2Enc($ENC_UTF8))
(path_csv, encoding) = ("/home/ratchet/Documents/dematic/tablite/tests/data/detect_misalignment.csv", str2Enc($ENC_UTF8))
# (path_csv, encoding) = ("/home/ratchet/Documents/dematic/callisto/tests/testing/data/Ritual B2B orderlines updated.csv", str2Enc($ENC_UTF8))
# (path_csv, encoding) = ("/home/ratchet/Documents/dematic/callisto/tests/testing/data/Ritual B2B orderlines_small.csv", str2Enc($ENC_UTF8))
# (path_csv, encoding) = ("/home/ratchet/Documents/dematic/tablite/tests/data/utf16_test.csv", str2Enc($ENC_UTF16))
# (path_csv, encoding) = ("/home/ratchet/Documents/dematic/tablite/tests/data/win1250_test.csv", str2ConvEnc("Windows-1252"))

Expand Down
2 changes: 0 additions & 2 deletions tablite/_nimlite/numpy.nim
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import std/unicode
# from std/strutils import parseInt, parseFloat
import infertypes

proc writeNumpyHeader*(fh: File, dtype: string, shape: uint): void =
const magic = "\x93NUMPY"
Expand Down
27 changes: 26 additions & 1 deletion tablite/_nimlite/paging.nim
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ type PageType = enum
PG_DATETIME
PG_DATE_SHORT

var none_str = ""

proc collectPageInfo*(
obj: var ReaderObj, fh: var BaseEncodedFile,
guess_dtypes: bool, n_pages: int, row_count: int,
Expand All @@ -31,6 +33,8 @@ proc collectPageInfo*(
else:
ranks = newSeq[Rank](0)



for (row_idx, fields, field_count) in obj.parseCSV(fh):
if row_count >= 0 and row_idx >= (uint row_count):
break
Expand Down Expand Up @@ -58,6 +62,13 @@ proc collectPageInfo*(
if dt == DataTypes.DT_STRING:
longest_str[fidx] = max(uint field.runeLen, longest_str[fidx])

for idx in (fidx+1)..n_pages-1:
# fill missing fields with nones
longest_str[idx] = max(uint none_str.len, longest_str[idx])

if guess_dtypes:
discard ranks[idx].updateRank(addr none_str)

inc n_rows

return (n_rows, longest_str, ranks)
Expand Down Expand Up @@ -220,7 +231,6 @@ proc dumpPageBody*(
inc fidx

var str = fields[idx]
# let fidx = uint idx
var fh = page_file_handlers[fidx]

if not guess_dtypes:
Expand Down Expand Up @@ -278,6 +288,21 @@ proc dumpPageBody*(
break
else: raise newException(Exception, "invalid: " & $dt)

for idx in (fidx+1)..n_pages-1:
var fh = page_file_handlers[idx]

if not guess_dtypes:
fh.writeNumpyUnicode(none_str, longest_str[idx])
else:
let dt = column_dtypes[idx]

case dt:
of PageType.PG_UNICODE:
fh.writeNumpyUnicode(none_str, longest_str[idx])
else:
fh.writePicklePyObj(PY_None, binput)


proc dumpPageFooter*(
n_pages: int, n_rows: uint,
page_file_handlers: var seq[File],
Expand Down
7 changes: 0 additions & 7 deletions tablite/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,6 @@ class Config(object):
when the number of fields (rows x columns) exceed this value,
multiprocessing is used.
"""

BACKEND_NIM = "NIM"
BACKEND_PYTHON = "PYTHON"
BACKEND = os.environ.get("USE_BACKEND", BACKEND_NIM).upper()

assert BACKEND in [BACKEND_NIM, BACKEND_PYTHON]

USE_NIMPORTER = os.environ.get("USE_NIMPORTER", "true").lower() in ["1", "t", "true", "y", "yes"]
ALLOW_CSV_READER_FALLTHROUGH = os.environ.get("ALLOW_CSV_READER_FALLTHROUGH", "true").lower() in ["1", "t", "true", "y", "yes"]

Expand Down
64 changes: 1 addition & 63 deletions tablite/file_reader_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,18 +80,8 @@ def __init__(
self.c = self._call_1
else:
self.c = self._call_2
elif not any(openings + closures):
self.c = self._call_3
else:
try:
# TODO: The regex below needs to be constructed dynamically depending on the inputs.
# fmt: off
self.re = re.compile('([\d\w\s\u4e00-\u9fff]+)(?=,|$)|((?<=\A)|(?<=,))(?=,|$)|(\(.+\)|".+")', "gmu") # noqa <-- Disclaimer: Audrius wrote this.
# fmt: on
self.c = self._call_4
except TypeError:
self.c = self._call_4_slow
# self.c = self._call_3
self.c = self._call_3

def __call__(self, s):
return self.c(s)
Expand All @@ -104,24 +94,6 @@ def _call_2(self, s):

def _call_3(self, s): # looks for qoutes.
words = []
# qoute = False
# ix = 0
# while ix < len(s):
# c = s[ix]
# if c == self.qoute:
# qoute = not qoute
# if qoute:
# ix += 1
# continue
# if c == self.delimiter:
# word, s = s[:ix], s[ix + self._delimiter_length :]
# word = word.lstrip(self.qoute).rstrip(self.qoute)
# words.append(word)
# ix = -1
# ix += 1
# if s:
# s = s.lstrip(self.qoute).rstrip(self.qoute)
# words.append(s)

class MyDialect(csv.Dialect):
delimiter = self.delimiter
Expand All @@ -137,40 +109,6 @@ class MyDialect(csv.Dialect):
words.extend(parsed_words)
return words

def _call_4(self, s):  # looks for quotes, openings and closures.
    # Fast path: delegates splitting to the regex precompiled in __init__
    # (self.re).  NOTE(review): re.match returns a Match object (or None),
    # not a list of words like the other _call_* variants — the author's
    # own "TODO - TEST!" marks this path as unverified; confirm callers
    # handle the return type before relying on it.
    return self.re.match(s)  # TODO - TEST!

def _call_4_slow(self, s):
    # Character-by-character fallback splitter (used when the regex fast
    # path could not be compiled): splits `s` on self.delimiter while
    # honouring quoted spans and nested opening/closing brackets, so a
    # delimiter inside quotes or inside brackets is not a split point.
    words = []
    qoute = False          # currently inside a quoted span?  [sic: "qoute"]
    ix, depth = 0, 0       # scan position; bracket nesting depth
    while ix < len(s):
        c = s[ix]

        if c == self.qoute:
            qoute = not qoute  # toggle quoting on every quote character

        if qoute:
            ix += 1            # never split while inside quotes
            continue

        if depth == 0 and c == self.delimiter:
            # Top-level delimiter: emit the word before it, then restart
            # the scan on the remainder of the string.
            word, s = s[:ix], s[ix + self._delimiter_length :]
            words.append(word.rstrip(self.qoute).lstrip(self.qoute))
            ix = -1            # incremented back to 0 below
        elif c in self.openings:
            depth += 1
        elif c in self.closures:
            depth -= 1         # NOTE(review): depth can go negative on
            #                    unbalanced input — TODO confirm intended
        else:
            pass
        ix += 1

    if s:
        # Trailing word after the last delimiter (quotes stripped).
        words.append(s.rstrip(self.qoute).lstrip(self.qoute))
    return words


def detect_seperator(text):
"""
:param path: pathlib.Path objects
Expand Down
Loading
Loading