Skip to content

Commit

Permalink
Merge pull request #143 from realratchet/master
Browse files Browse the repository at this point in the history
do not skip headers
  • Loading branch information
realratchet authored Mar 6, 2024
2 parents 0e80b06 + 7b96d78 commit 22b63ac
Show file tree
Hide file tree
Showing 10 changed files with 52 additions and 68 deletions.
8 changes: 2 additions & 6 deletions nimlite/funcs/text_reader/csvparse.nim
Original file line number Diff line number Diff line change
Expand Up @@ -278,19 +278,15 @@ proc checkSkipEmpty*(skipEmpty: SkipEmpty, fields: ptr seq[string], fieldCount:
return true
return false

proc readColumns*(path: string, encoding: FileEncoding, dialect: Dialect, rowOffset: uint, skipEmpty: SkipEmpty): (seq[string], uint) =
proc readColumns*(path: string, encoding: FileEncoding, dialect: Dialect, rowOffset: uint): seq[string] =
let fh = newFile(path, encoding)
var obj = newReaderObj(dialect)
var skippedRows = 0u

try:
fh.setFilePos(int64 rowOffset, fspSet)

for (idxRow, fields, fieldCount) in obj.parseCSV(fh):
if skipEmpty.checkSkipEmpty(fields, fieldCount):
inc skippedRows
continue
return (fields[0..<fieldCount], skippedRows)
return fields[0..<fieldCount]
finally:
fh.close()

Expand Down
4 changes: 2 additions & 2 deletions nimlite/funcs/text_reader/pylayer.nim
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,9 @@ proc getHeaders*(
path: string, encoding: FileEncoding,
headerRowIndex: uint, lineCount: int,
newline: char, delimiter: char,
textQualifier: char, stripLeadingAndTailingWhitespace: bool, skipEmpty: SkipEmpty,
textQualifier: char, stripLeadingAndTailingWhitespace: bool,
quoting: Quoting
): seq[seq[string]] =
let dialect = makeDialect()

return getHeaders(path, encoding, dialect, skipEmpty, headerRowIndex, lineCount)
return getHeaders(path, encoding, dialect, headerRowIndex, lineCount)
11 changes: 3 additions & 8 deletions nimlite/funcs/text_reader/text_reader.nim
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ proc textReaderTask*(task: TaskArgs, page_info: PageInfo): seq[nimpy.PyObject] =
finally:
fh.close()

proc getHeaders*(path: string, encoding: FileEncoding, dia: Dialect, skipEmpty: SkipEmpty, headerRowIndex: uint, lineCount: int): seq[seq[string]] =
proc getHeaders*(path: string, encoding: FileEncoding, dia: Dialect, headerRowIndex: uint, lineCount: int): seq[seq[string]] =
let fh = newFile(path, encoding)
var obj = newReaderObj(dia)

Expand All @@ -122,9 +122,6 @@ proc getHeaders*(path: string, encoding: FileEncoding, dia: Dialect, skipEmpty:
var headers = newSeqOfCap[seq[string]](lineCount)

for (idxRow, fields, fieldCount) in obj.parseCSV(fh):
if skipEmpty.checkSkipEmpty(fields, fieldCount):
continue

if linesToSkip > 0:
dec linesToSkip
continue
Expand Down Expand Up @@ -162,7 +159,7 @@ proc importTextFile*(
createDir(dirname)

if newlines > 0 and newlines > headerRowIndex:
let (firstLine, skippedLines) = readColumns(path, encoding, dia, newlineOffsets[headerRowIndex], skipEmpty)
let firstLine = readColumns(path, encoding, dia, newlineOffsets[headerRowIndex])

var fields = newSeq[string](0)

Expand Down Expand Up @@ -201,8 +198,6 @@ proc importTextFile*(
if name in impColumns:
{uint ix: name}

echo fieldRelation

let importFields = collect: (for k in fieldRelation.keys: k)
let importFieldNames = collect: (for v in fieldRelation.values: v)

Expand All @@ -223,7 +218,7 @@ proc importTextFile*(

{unq: fieldRelationInv[name]}

let offsetRow = (if firstRowHasHeaders: 1 else: 0) + int (headerRowIndex + skippedLines)
let offsetRow = (if firstRowHasHeaders: 1 else: 0) + int headerRowIndex

var pageIdx: uint32 = 1
var rowIdx: uint = uint optStart + offsetRow
Expand Down
5 changes: 2 additions & 3 deletions nimlite/libnimlite.nim
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ when isLib:
newline: string, delimiter: string, text_qualifier: string,
strip_leading_and_tailing_whitespace: bool,
quoting: string,
header_row_index: uint, linecount: int, skip_empty: SkipEmpty): seq[seq[string]] {.exportpy.} =
header_row_index: uint, linecount: int): seq[seq[string]] {.exportpy.} =
var arg_encoding = str2Enc(encoding)
var arg_newline = (if newline.len == 1: newline[0] else: raise newException(Exception, "'newline' not a char"))
var arg_delimiter = (if delimiter.len == 1: delimiter[0] else: raise newException(Exception, "'delimiter' not a char"))
Expand All @@ -93,8 +93,7 @@ when isLib:
delimiter = arg_delimiter,
textQualifier = arg_text_qualifier,
stripLeadingAndTailingWhitespace = strip_leading_and_tailing_whitespace,
quoting = arg_quoting,
skipEmpty = skip_empty
quoting = arg_quoting
)

return headers
Expand Down
4 changes: 3 additions & 1 deletion nimlite/libnimlite.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,11 @@ def text_reader_task(path, encoding, dia_delimiter, dia_quotechar, dia_escap
def text_reader(pid, path, encoding, columns, first_row_has_headers, header_row_index, start, limit, guess_datatypes, newline, delimiter, text_qualifier, strip_leading_and_tailing_whitespace, skip_empty, page_size, quoting):
pass


def get_headers(path, encoding, newline, delimiter, text_qualifier, strip_leading_and_tailing_whitespace, page_size, quoting, header_row_index, linecount):
pass


def collect_column_select_info(table, cols, dir_pid, pbar):
pass

Expand All @@ -25,4 +27,4 @@ def repaginate(column):


def collect_text_reader_page_info_task(task_info, task):
pass
pass
14 changes: 3 additions & 11 deletions tablite/import_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,20 +198,12 @@ def excel_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=

worksheet = book[sheet]
fixup_worksheet(worksheet)
skipped_rows = 0

try:
it_header = worksheet.iter_rows(min_row=header_row_index + 1)
while True:
# get the first row to know our headers or the number of columns
row = [c.value for c in next(it_header)]

if skip_empty == "ALL" and all(v is None for v in row):
skipped_rows = skipped_rows + 1
continue
elif skip_empty == "ANY" and any(v is None for v in row):
skipped_rows = skipped_rows + 1
continue
break
fields = [str(c) if c is not None else "" for c in row] # excel is offset by 1
except StopIteration:
Expand Down Expand Up @@ -239,7 +231,7 @@ def excel_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=
field_dict[unique_name(k, field_dict.keys())] = i

# calculate our data rows iterator offset
it_offset = start + (1 if first_row_has_headers else 0) + (header_row_index + skipped_rows) + 1
it_offset = start + (1 if first_row_has_headers else 0) + header_row_index + 1

# attempt to fetch number of rows in the sheet
total_rows = worksheet.max_row
Expand Down Expand Up @@ -379,8 +371,8 @@ def ods_reader(T, path, first_row_has_headers=True, header_row_index=0, sheet=No
fn_filter = any if skip_empty == "ALL" else all # this is intentional
data = [
row
for row in data
if fn_filter(not (v is None or isinstance(v, str) and len(v) == 0) for v in row)
for ridx, row in enumerate(data)
if ridx < header_row_index + (1 if first_row_has_headers else 0) or fn_filter(not (v is None or isinstance(v, str) and len(v) == 0) for v in row)
]

data = np.array(data, dtype=np.object_) # cast back to numpy array for slicing but don't try to convert datatypes
Expand Down
5 changes: 2 additions & 3 deletions tablite/nimlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def get_headers(
*,
header_row_index: int=0,
newline: str='\n', delimiter: str=',', text_qualifier: str='"',
quoting: ValidQuoting, strip_leading_and_tailing_whitespace: bool=True, skip_empty: ValidSkipEmpty="NONE",
quoting: ValidQuoting, strip_leading_and_tailing_whitespace: bool=True,
linecount: int = 10
) -> list[list[str]]:
return nl.get_headers(
Expand All @@ -51,8 +51,7 @@ def get_headers(
strip_leading_and_tailing_whitespace=strip_leading_and_tailing_whitespace,
header_row_index=header_row_index,
quoting=quoting,
linecount=linecount,
skip_empty=skip_empty
linecount=linecount
)

def text_reader(
Expand Down
2 changes: 1 addition & 1 deletion tablite/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from pathlib import Path
import numpy as np
import ast
from datetime import datetime, date, time, timedelta, timezone # noqa
from datetime import datetime, date, time, timedelta # noqa
from itertools import compress
import string
import random
Expand Down
2 changes: 1 addition & 1 deletion tablite/version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
major, minor, patch = 2023, 10, 4
major, minor, patch = 2023, 10, 5
__version_info__ = (major, minor, patch)
__version__ = ".".join(str(i) for i in __version_info__)
65 changes: 33 additions & 32 deletions tests/test_filereader_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -754,7 +754,7 @@ def test_filereader_with_empties_text():
root_path = Path(__file__).parent / "data"
path = root_path / "with_empty_lines.csv"
assert path.exists()
table = Table.from_file(path, text_qualifier=None, skip_empty="NONE")
table = Table.from_file(path, text_qualifier=None, skip_empty="NONE",)

assert len(table) == 8
assert table.to_dict() == {
Expand All @@ -768,44 +768,45 @@ def test_filereader_with_empties_text():

table = Table.from_file(path, text_qualifier=None, skip_empty="ALL")

assert len(table) == 2
assert len(table) == 3
assert table.to_dict() == {
'a': [0, None],
'b': [1, None],
'c': [2, None],
'd': [3, 9],
'e': [4, None],
'f': [5, None]
'': ['a', '0', ''],
'_1': ['b', '1', ''],
'_2': ['c', '2', ''],
'_3': ['d', '3', '9'],
'_4': ['e', '4', ''],
'_5': ['f', '5', '']
}

table = Table.from_file(path, text_qualifier=None, skip_empty="ANY")

assert len(table) == 1
assert len(table) == 2
assert table.to_dict() == {
'a': [0],
'b': [1],
'c': [2],
'd': [3],
'e': [4],
'f': [5]
'': ['a', '0'],
'_1': ['b', '1'],
'_2': ['c', '2'],
'_3': ['d', '3'],
'_4': ['e', '4'],
'_5': ['f', '5']
}


def test_filereader_with_empties_excel():
root_path = Path(__file__).parent / "data"

fnames = (
"with_empty_lines.xlsx",
"with_empty_lines.ods",
)

for fname in fnames:
path = root_path / fname
assert path.exists()
table = Table.from_file(path, skip_empty="NONE", sheet="with_empty_lines")

assert len(table) == 7
assert table.to_dict() == {
'': [None, 'a', None, None, 0, None, None],
'': [None, 'a', None, None, 0, None, None],
'_1': [None, 'b', None, None, 1, None, None],
'_2': [None, 'c', None, None, 2, None, None],
'_3': [None, 'd', None, None, 3, None, 9],
Expand All @@ -815,24 +816,24 @@ def test_filereader_with_empties_excel():

table = Table.from_file(path, skip_empty="ALL", sheet="with_empty_lines")

assert len(table) == 2
assert len(table) == 3
assert table.to_dict() == {
'a': [0, None],
'b': [1, None],
'c': [2, None],
'd': [3, 9],
'e': [4, None],
'f': [5, None]
'': ['a', 0, None],
'_1': ['b', 1, None],
'_2': ['c', 2, None],
'_3': ['d', 3, 9],
'_4': ['e', 4, None],
'_5': ['f', 5, None]
}

table = Table.from_file(path, skip_empty="ANY", sheet="with_empty_lines")

assert len(table) == 1
assert len(table) == 2
assert table.to_dict() == {
'a': [0],
'b': [1],
'c': [2],
'd': [3],
'e': [4],
'f': [5]
'': ['a', 0],
'_1': ['b', 1],
'_2': ['c', 2],
'_3': ['d', 3],
'_4': ['e', 4],
'_5': ['f', 5]
}

0 comments on commit 22b63ac

Please sign in to comment.