Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix misaligned pages #111

Merged
merged 5 commits into from
Nov 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion tablite/_nimlite/nimlite.nim
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,10 @@ when isMainModule and appType != "lib":
# (path_csv, encoding) = ("/home/ratchet/Documents/dematic/callisto/tests/testing/data/Dealz Poland v1.csv", str2Enc($ENC_UTF8))
# (path_csv, encoding) = ("/home/ratchet/Documents/dematic/tablite/tests/data/floats.csv", str2Enc($ENC_UTF8))
# (path_csv, encoding) = ("/home/ratchet/Documents/dematic/tablite/tests/data/bad_empty.csv", str2Enc($ENC_UTF8))
(path_csv, encoding) = ("/home/ratchet/Documents/dematic/tablite/tests/data/book1.csv", str2Enc($ENC_UTF8))
# (path_csv, encoding) = ("/home/ratchet/Documents/dematic/tablite/tests/data/book1.csv", str2Enc($ENC_UTF8))
(path_csv, encoding) = ("/home/ratchet/Documents/dematic/tablite/tests/data/detect_misalignment.csv", str2Enc($ENC_UTF8))
# (path_csv, encoding) = ("/home/ratchet/Documents/dematic/callisto/tests/testing/data/Ritual B2B orderlines updated.csv", str2Enc($ENC_UTF8))
# (path_csv, encoding) = ("/home/ratchet/Documents/dematic/callisto/tests/testing/data/Ritual B2B orderlines_small.csv", str2Enc($ENC_UTF8))
# (path_csv, encoding) = ("/home/ratchet/Documents/dematic/tablite/tests/data/utf16_test.csv", str2Enc($ENC_UTF16))
# (path_csv, encoding) = ("/home/ratchet/Documents/dematic/tablite/tests/data/win1250_test.csv", str2ConvEnc("Windows-1252"))

Expand Down
2 changes: 0 additions & 2 deletions tablite/_nimlite/numpy.nim
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import std/unicode
# from std/strutils import parseInt, parseFloat
import infertypes

proc writeNumpyHeader*(fh: File, dtype: string, shape: uint): void =
const magic = "\x93NUMPY"
Expand Down
27 changes: 26 additions & 1 deletion tablite/_nimlite/paging.nim
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ type PageType = enum
PG_DATETIME
PG_DATE_SHORT

var none_str = ""

proc collectPageInfo*(
obj: var ReaderObj, fh: var BaseEncodedFile,
guess_dtypes: bool, n_pages: int, row_count: int,
Expand All @@ -31,6 +33,8 @@ proc collectPageInfo*(
else:
ranks = newSeq[Rank](0)



for (row_idx, fields, field_count) in obj.parseCSV(fh):
if row_count >= 0 and row_idx >= (uint row_count):
break
Expand Down Expand Up @@ -58,6 +62,13 @@ proc collectPageInfo*(
if dt == DataTypes.DT_STRING:
longest_str[fidx] = max(uint field.runeLen, longest_str[fidx])

for idx in (fidx+1)..n_pages-1:
# fill missing fields with nones
longest_str[idx] = max(uint none_str.len, longest_str[idx])

if guess_dtypes:
discard ranks[idx].updateRank(addr none_str)

inc n_rows

return (n_rows, longest_str, ranks)
Expand Down Expand Up @@ -220,7 +231,6 @@ proc dumpPageBody*(
inc fidx

var str = fields[idx]
# let fidx = uint idx
var fh = page_file_handlers[fidx]

if not guess_dtypes:
Expand Down Expand Up @@ -278,6 +288,21 @@ proc dumpPageBody*(
break
else: raise newException(Exception, "invalid: " & $dt)

for idx in (fidx+1)..n_pages-1:
var fh = page_file_handlers[idx]

if not guess_dtypes:
fh.writeNumpyUnicode(none_str, longest_str[idx])
else:
let dt = column_dtypes[idx]

case dt:
of PageType.PG_UNICODE:
fh.writeNumpyUnicode(none_str, longest_str[idx])
else:
fh.writePicklePyObj(PY_None, binput)


proc dumpPageFooter*(
n_pages: int, n_rows: uint,
page_file_handlers: var seq[File],
Expand Down
7 changes: 0 additions & 7 deletions tablite/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,6 @@ class Config(object):
when the number of fields (rows x columns) exceed this value,
multiprocessing is used.
"""

BACKEND_NIM = "NIM"
BACKEND_PYTHON = "PYTHON"
BACKEND = os.environ.get("USE_BACKEND", BACKEND_NIM).upper()

assert BACKEND in [BACKEND_NIM, BACKEND_PYTHON]

USE_NIMPORTER = os.environ.get("USE_NIMPORTER", "true").lower() in ["1", "t", "true", "y", "yes"]
ALLOW_CSV_READER_FALLTHROUGH = os.environ.get("ALLOW_CSV_READER_FALLTHROUGH", "true").lower() in ["1", "t", "true", "y", "yes"]

Expand Down
64 changes: 1 addition & 63 deletions tablite/file_reader_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,18 +80,8 @@ def __init__(
self.c = self._call_1
else:
self.c = self._call_2
elif not any(openings + closures):
self.c = self._call_3
else:
try:
# TODO: The regex below needs to be constructed dynamically depending on the inputs.
# fmt: off
self.re = re.compile('([\d\w\s\u4e00-\u9fff]+)(?=,|$)|((?<=\A)|(?<=,))(?=,|$)|(\(.+\)|".+")', "gmu") # noqa <-- Disclaimer: Audrius wrote this.
# fmt: on
self.c = self._call_4
except TypeError:
self.c = self._call_4_slow
# self.c = self._call_3
self.c = self._call_3

def __call__(self, s):
return self.c(s)
Expand All @@ -104,24 +94,6 @@ def _call_2(self, s):

def _call_3(self, s): # looks for qoutes.
words = []
# qoute = False
# ix = 0
# while ix < len(s):
# c = s[ix]
# if c == self.qoute:
# qoute = not qoute
# if qoute:
# ix += 1
# continue
# if c == self.delimiter:
# word, s = s[:ix], s[ix + self._delimiter_length :]
# word = word.lstrip(self.qoute).rstrip(self.qoute)
# words.append(word)
# ix = -1
# ix += 1
# if s:
# s = s.lstrip(self.qoute).rstrip(self.qoute)
# words.append(s)

class MyDialect(csv.Dialect):
delimiter = self.delimiter
Expand All @@ -137,40 +109,6 @@ class MyDialect(csv.Dialect):
words.extend(parsed_words)
return words

def _call_4(self, s):  # looks for quotes, openings and closures.
    # Fast path: delegates splitting to the regex precompiled in __init__
    # (self.re).  NOTE(review): re.match returns a Match object (or None),
    # not a list of words like the other _call_* variants — the author's
    # own "TODO - TEST!" marks this path as unverified; confirm callers
    # handle the return type before relying on it.
    return self.re.match(s)  # TODO - TEST!

def _call_4_slow(self, s):
    # Character-by-character fallback splitter (used when the regex fast
    # path could not be compiled): splits `s` on self.delimiter while
    # honouring quoted spans and nested opening/closing brackets, so a
    # delimiter inside quotes or inside brackets is not a split point.
    words = []
    qoute = False          # currently inside a quoted span?  [sic: "qoute"]
    ix, depth = 0, 0       # scan position; bracket nesting depth
    while ix < len(s):
        c = s[ix]

        if c == self.qoute:
            qoute = not qoute  # toggle quoting on every quote character

        if qoute:
            ix += 1            # never split while inside quotes
            continue

        if depth == 0 and c == self.delimiter:
            # Top-level delimiter: emit the word before it, then restart
            # the scan on the remainder of the string.
            word, s = s[:ix], s[ix + self._delimiter_length :]
            words.append(word.rstrip(self.qoute).lstrip(self.qoute))
            ix = -1            # incremented back to 0 below
        elif c in self.openings:
            depth += 1
        elif c in self.closures:
            depth -= 1         # NOTE(review): depth can go negative on
            #                    unbalanced input — TODO confirm intended
        else:
            pass
        ix += 1

    if s:
        # Trailing word after the last delimiter (quotes stripped).
        words.append(s.rstrip(self.qoute).lstrip(self.qoute))
    return words


def detect_seperator(text):
"""
:param path: pathlib.Path objects
Expand Down
Loading
Loading