Merge pull request #143 from realratchet/master

do not skip headers
root-11 · Mar 6, 2024 · 22b63ac
2 parents 0e80b06 + 7b96d78
commit 22b63ac
Show file tree

Hide file tree

Showing 10 changed files with 52 additions and 68 deletions.
diff --git a/nimlite/funcs/text_reader/csvparse.nim b/nimlite/funcs/text_reader/csvparse.nim
@@ -278,19 +278,15 @@ proc checkSkipEmpty*(skipEmpty: SkipEmpty, fields: ptr seq[string], fieldCount:
                 return true
         return false
 
-proc readColumns*(path: string, encoding: FileEncoding, dialect: Dialect, rowOffset: uint, skipEmpty: SkipEmpty): (seq[string], uint) =
+proc readColumns*(path: string, encoding: FileEncoding, dialect: Dialect, rowOffset: uint): seq[string] =
     let fh = newFile(path, encoding)
     var obj = newReaderObj(dialect)
-    var skippedRows = 0u
 
     try:
         fh.setFilePos(int64 rowOffset, fspSet)
 
         for (idxRow, fields, fieldCount) in obj.parseCSV(fh):
-            if skipEmpty.checkSkipEmpty(fields, fieldCount):
-                inc skippedRows
-                continue
-            return (fields[0..<fieldCount], skippedRows)
+            return fields[0..<fieldCount]
     finally:
         fh.close()
 

diff --git a/nimlite/funcs/text_reader/pylayer.nim b/nimlite/funcs/text_reader/pylayer.nim
@@ -44,9 +44,9 @@ proc getHeaders*(
     path: string, encoding: FileEncoding,
     headerRowIndex: uint, lineCount: int,
     newline: char, delimiter: char,
-    textQualifier: char, stripLeadingAndTailingWhitespace: bool, skipEmpty: SkipEmpty,
+    textQualifier: char, stripLeadingAndTailingWhitespace: bool,
     quoting: Quoting
 ): seq[seq[string]] =
     let dialect = makeDialect()
 
-    return getHeaders(path, encoding, dialect, skipEmpty, headerRowIndex, lineCount)
+    return getHeaders(path, encoding, dialect, headerRowIndex, lineCount)
diff --git a/nimlite/funcs/text_reader/text_reader.nim b/nimlite/funcs/text_reader/text_reader.nim
@@ -112,7 +112,7 @@ proc textReaderTask*(task: TaskArgs, page_info: PageInfo): seq[nimpy.PyObject] =
     finally:
         fh.close()
 
-proc getHeaders*(path: string, encoding: FileEncoding, dia: Dialect, skipEmpty: SkipEmpty, headerRowIndex: uint, lineCount: int): seq[seq[string]] =
+proc getHeaders*(path: string, encoding: FileEncoding, dia: Dialect, headerRowIndex: uint, lineCount: int): seq[seq[string]] =
     let fh = newFile(path, encoding)
     var obj = newReaderObj(dia)
 
@@ -122,9 +122,6 @@ proc getHeaders*(path: string, encoding: FileEncoding, dia: Dialect, skipEmpty:
         var headers = newSeqOfCap[seq[string]](lineCount)
 
         for (idxRow, fields, fieldCount) in obj.parseCSV(fh):
-            if skipEmpty.checkSkipEmpty(fields, fieldCount):
-                continue
-
             if linesToSkip > 0:
                 dec linesToSkip
                 continue
@@ -162,7 +159,7 @@ proc importTextFile*(
         createDir(dirname)
 
     if newlines > 0 and newlines > headerRowIndex:
-        let (firstLine, skippedLines) = readColumns(path, encoding, dia, newlineOffsets[headerRowIndex], skipEmpty)
+        let firstLine = readColumns(path, encoding, dia, newlineOffsets[headerRowIndex])
 
         var fields = newSeq[string](0)
 
@@ -201,8 +198,6 @@ proc importTextFile*(
                 if name in impColumns:
                     {uint ix: name}
 
-        echo fieldRelation
-
         let importFields = collect: (for k in fieldRelation.keys: k)
         let importFieldNames = collect: (for v in fieldRelation.values: v)
 
@@ -223,7 +218,7 @@ proc importTextFile*(
 
                 {unq: fieldRelationInv[name]}
 
-        let offsetRow = (if firstRowHasHeaders: 1 else: 0) + int (headerRowIndex + skippedLines)
+        let offsetRow = (if firstRowHasHeaders: 1 else: 0) + int headerRowIndex
 
         var pageIdx: uint32 = 1
         var rowIdx: uint = uint optStart + offsetRow

diff --git a/nimlite/libnimlite.nim b/nimlite/libnimlite.nim
@@ -77,7 +77,7 @@ when isLib:
         newline: string, delimiter: string, text_qualifier: string,
         strip_leading_and_tailing_whitespace: bool,
         quoting: string,
-        header_row_index: uint, linecount: int, skip_empty: SkipEmpty): seq[seq[string]] {.exportpy.} =
+        header_row_index: uint, linecount: int): seq[seq[string]] {.exportpy.} =
         var arg_encoding = str2Enc(encoding)
         var arg_newline = (if newline.len == 1: newline[0] else: raise newException(Exception, "'newline' not a char"))
         var arg_delimiter = (if delimiter.len == 1: delimiter[0] else: raise newException(Exception, "'delimiter' not a char"))
@@ -93,8 +93,7 @@ when isLib:
             delimiter = arg_delimiter,
             textQualifier = arg_text_qualifier,
             stripLeadingAndTailingWhitespace = strip_leading_and_tailing_whitespace,
-            quoting = arg_quoting,
-            skipEmpty = skip_empty
+            quoting = arg_quoting
         )
 
         return headers

diff --git a/nimlite/libnimlite.pyi b/nimlite/libnimlite.pyi
@@ -5,9 +5,11 @@ def text_reader_task(path,  encoding,  dia_delimiter,  dia_quotechar,  dia_escap
 def text_reader(pid, path, encoding, columns, first_row_has_headers, header_row_index, start, limit, guess_datatypes, newline, delimiter, text_qualifier, strip_leading_and_tailing_whitespace, skip_empty, page_size, quoting):
     pass
 
+
 def get_headers(path, encoding, newline, delimiter, text_qualifier, strip_leading_and_tailing_whitespace, page_size, quoting, header_row_index, linecount):
     pass
 
+
 def collect_column_select_info(table, cols, dir_pid, pbar):
     pass
 
@@ -25,4 +27,4 @@ def repaginate(column):
 
 
 def collect_text_reader_page_info_task(task_info, task):
-    pass
+    pass
diff --git a/tablite/import_utils.py b/tablite/import_utils.py
@@ -198,20 +198,12 @@ def excel_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=
 
     worksheet = book[sheet]
     fixup_worksheet(worksheet)
-    skipped_rows = 0
 
     try:
         it_header = worksheet.iter_rows(min_row=header_row_index + 1)
         while True:
             # get the first row to know our headers or the number of columns
             row = [c.value for c in next(it_header)]
-
-            if skip_empty == "ALL" and all(v is None for v in row):
-                skipped_rows = skipped_rows + 1
-                continue
-            elif skip_empty == "ANY" and any(v is None for v in row):
-                skipped_rows = skipped_rows + 1
-                continue
             break
         fields = [str(c) if c is not None else "" for c in row] # excel is offset by 1
     except StopIteration:
@@ -239,7 +231,7 @@ def excel_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=
             field_dict[unique_name(k, field_dict.keys())] = i
 
     # calculate our data rows iterator offset
-    it_offset = start + (1 if first_row_has_headers else 0) + (header_row_index + skipped_rows) + 1
+    it_offset = start + (1 if first_row_has_headers else 0) + header_row_index + 1
 
     # attempt to fetch number of rows in the sheet
     total_rows = worksheet.max_row
@@ -379,8 +371,8 @@ def ods_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=No
         fn_filter = any if skip_empty == "ALL" else all # this is intentional
         data = [
             row
-            for row in data
-            if fn_filter(not (v is None or isinstance(v, str) and len(v) == 0) for v in row)
+            for ridx, row in enumerate(data)
+            if ridx < header_row_index + (1 if first_row_has_headers else 0) or fn_filter(not (v is None or isinstance(v, str) and len(v) == 0) for v in row)
         ]
 
     data = np.array(data, dtype=np.object_) # cast back to numpy array for slicing but don't try to convert datatypes

diff --git a/tablite/nimlite.py b/tablite/nimlite.py
@@ -41,7 +41,7 @@ def get_headers(
     *,
     header_row_index: int=0,
     newline: str='\n', delimiter: str=',', text_qualifier: str='"',
-    quoting: ValidQuoting, strip_leading_and_tailing_whitespace: bool=True, skip_empty: ValidSkipEmpty="NONE",
+    quoting: ValidQuoting, strip_leading_and_tailing_whitespace: bool=True,
     linecount: int = 10
 ) -> list[list[str]]:
     return nl.get_headers(
@@ -51,8 +51,7 @@ def get_headers(
             strip_leading_and_tailing_whitespace=strip_leading_and_tailing_whitespace,
             header_row_index=header_row_index,
             quoting=quoting,
-            linecount=linecount,
-            skip_empty=skip_empty
+            linecount=linecount
         )
 
 def text_reader(

diff --git a/tablite/utils.py b/tablite/utils.py
@@ -6,7 +6,7 @@
 from pathlib import Path
 import numpy as np
 import ast
-from datetime import datetime, date, time, timedelta, timezone  # noqa
+from datetime import datetime, date, time, timedelta  # noqa
 from itertools import compress
 import string
 import random

diff --git a/tablite/version.py b/tablite/version.py
@@ -1,3 +1,3 @@
-major, minor, patch = 2023, 10, 4
+major, minor, patch = 2023, 10, 5
 __version_info__ = (major, minor, patch)
 __version__ = ".".join(str(i) for i in __version_info__)
diff --git a/tests/test_filereader_formats.py b/tests/test_filereader_formats.py
@@ -754,7 +754,7 @@ def test_filereader_with_empties_text():
     root_path = Path(__file__).parent / "data"
     path = root_path / "with_empty_lines.csv"
     assert path.exists()
-    table = Table.from_file(path, text_qualifier=None, skip_empty="NONE")
+    table = Table.from_file(path, text_qualifier=None, skip_empty="NONE",)
 
     assert len(table) == 8
     assert table.to_dict() == {
@@ -768,44 +768,45 @@ def test_filereader_with_empties_text():
 
     table = Table.from_file(path, text_qualifier=None, skip_empty="ALL")
 
-    assert len(table) == 2
+    assert len(table) == 3
     assert table.to_dict() == {
-        'a': [0, None],
-        'b': [1, None],
-        'c': [2, None],
-        'd': [3, 9],
-        'e': [4, None],
-        'f': [5, None]
+        '':   ['a', '0', ''],
+        '_1': ['b', '1', ''],
+        '_2': ['c', '2', ''],
+        '_3': ['d', '3', '9'],
+        '_4': ['e', '4', ''],
+        '_5': ['f', '5', '']
     }
 
     table = Table.from_file(path, text_qualifier=None, skip_empty="ANY")
 
-    assert len(table) == 1
+    assert len(table) == 2
     assert table.to_dict() == {
-        'a': [0],
-        'b': [1],
-        'c': [2],
-        'd': [3],
-        'e': [4],
-        'f': [5]
+        '':   ['a', '0'],
+        '_1': ['b', '1'],
+        '_2': ['c', '2'],
+        '_3': ['d', '3'],
+        '_4': ['e', '4'],
+        '_5': ['f', '5']
     }
 
+
 def test_filereader_with_empties_excel():
     root_path = Path(__file__).parent / "data"
-    
+
     fnames = (
         "with_empty_lines.xlsx",
         "with_empty_lines.ods",
     )
-    
+
     for fname in fnames:
         path = root_path / fname
         assert path.exists()
         table = Table.from_file(path, skip_empty="NONE", sheet="with_empty_lines")
 
         assert len(table) == 7
         assert table.to_dict() == {
-            '': [None, 'a', None, None, 0, None, None],
+            '':   [None, 'a', None, None, 0, None, None],
             '_1': [None, 'b', None, None, 1, None, None],
             '_2': [None, 'c', None, None, 2, None, None],
             '_3': [None, 'd', None, None, 3, None, 9],
@@ -815,24 +816,24 @@ def test_filereader_with_empties_excel():
 
         table = Table.from_file(path, skip_empty="ALL", sheet="with_empty_lines")
 
-        assert len(table) == 2
+        assert len(table) == 3
         assert table.to_dict() == {
-            'a': [0, None],
-            'b': [1, None],
-            'c': [2, None],
-            'd': [3, 9],
-            'e': [4, None],
-            'f': [5, None]
+            '':    ['a', 0, None],
+            '_1':  ['b', 1, None],
+            '_2':  ['c', 2, None],
+            '_3':  ['d', 3, 9],
+            '_4':  ['e', 4, None],
+            '_5':  ['f', 5, None]
         }
 
         table = Table.from_file(path, skip_empty="ANY", sheet="with_empty_lines")
 
-        assert len(table) == 1
+        assert len(table) == 2
         assert table.to_dict() == {
-            'a': [0],
-            'b': [1],
-            'c': [2],
-            'd': [3],
-            'e': [4],
-            'f': [5]
+            '':   ['a', 0],
+            '_1': ['b', 1],
+            '_2': ['c', 2],
+            '_3': ['d', 3],
+            '_4': ['e', 4],
+            '_5': ['f', 5]
         }