Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed some edge cases in filter pagination #157

Merged
merged 7 commits into from
Apr 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/actions/buildnim/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ runs:
shell: bash
run: |
nimble -y refresh
nimble -y install nimpy
nimble -y install nimpy dotenv
- name: Set Environment Variables
uses: allenevans/[email protected]
with:
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ site/

# personal notes
*scratch*.py
*scratch*.nim

# local confidential data
tests/ndata/*
Expand Down
5 changes: 3 additions & 2 deletions nimlite.nimble
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Package

version = "0.2.1"
version = "0.3.0"
author = "Ratchet"
description = "Utilities for tablite to work with nim"
license = "MIT"
Expand All @@ -10,4 +10,5 @@ license = "MIT"
# Dependencies

requires "nim >= 2.0.0"
requires "nimpy >= 0.2.0"
requires "nimpy >= 0.2.0"
requires "dotenv >= 2.0.0"
220 changes: 0 additions & 220 deletions nimlite/funcs/column_selector.nim
Original file line number Diff line number Diff line change
Expand Up @@ -7,223 +7,3 @@ export toPyObj
export collectColumnSelectInfo
export doSliceConvert
export fromPyObjToDesiredInfos

when isMainModule and appType != "lib":

import std/[os, tables, sugar, sets, sequtils, paths, macros]
import nimpy
from ../nimpyext import `!`
import std/options as opt
import ../pymodules
import ../numpy
import typetraits

proc columnSelect(table: nimpy.PyObject, cols: nimpy.PyObject, tqdm: nimpy.PyObject, dir_pid: Path, TaskManager: nimpy.PyObject): (nimpy.PyObject, nimpy.PyObject) =
  ## Nim-only driver for column selection; returns `(passTable, failTable)`.
  ##
  ## The library build does not need this proc because there the slices are
  ## dispatched through `TaskManager`. Here `TaskManager` is accepted only to
  ## keep the signature aligned and is never used: every slice is converted
  ## sequentially in-process.
  ##
  ## * `table`   - source table object.
  ## * `cols`    - column selection descriptors, forwarded to
  ##               `collectColumnSelectInfo`.
  ## * `tqdm`    - progress bar class; instantiated with `total = 100`.
  ## * `dir_pid` - directory handed to `collectColumnSelectInfo` and
  ##               `doSliceConvert`.
  let TableClass = modules().getType(table)
  var pbar = tqdm!(total: 100, desc: "column select")

  template finishProgress(): void =
    # Push the bar to its total and close it, so that neither return path
    # leaves a dangling, half-filled progress bar behind.
    discard pbar.update(pbar.total.to(float) - pbar.n.to(float))
    discard pbar.close()

  let colInfoResult = collectColumnSelectInfo(table, cols, string dir_pid, pbar)

  if toSeq(colInfoResult.isCorrectType.values).allIt(it):
    # Fast path: every requested column already has the desired type, so the
    # original columns are reused as-is and the fail table only gets empty
    # columns.
    let tblPassColumns = collect(initTable()):
      for (desiredName, desiredInfo) in colInfoResult.desiredColumnMap.pairs():
        {desiredName: table[desiredInfo.originalName]}

    let tblFailColumns = collect(initTable()):
      for desiredName in colInfoResult.failedColumnData:
        {desiredName: newSeq[nimpy.PyObject]()}

    let tblPass = TableClass!(columns: tblPassColumns)
    let tblFail = TableClass!(columns: tblFailColumns)

    # Previously this path returned without ever completing/closing the bar.
    finishProgress()

    return (tblPass, tblFail)

  template ordered2PyDict(keys: seq[string]): nimpy.PyObject =
    # Build a Python dict holding an empty list per key, inserted in `keys` order.
    let dict = modules().builtins.classes.DictClass!()

    for k in keys:
      dict[k] = newSeq[nimpy.PyObject]()

    dict

  # `var` because `extendTable` below takes the tables as `var` parameters.
  var tblPass = TableClass!(columns = colInfoResult.passedColumnData.ordered2PyDict())
  var tblFail = TableClass!(columns = colInfoResult.failedColumnData.ordered2PyDict())

  # One task per page index: the per-column (page, original page) pairs plus
  # the pass/fail result slots for that index.
  let taskListInp = collect:
    for i in 0..<colInfoResult.pageCount:
      let el = collect(initTable()):
        for (name, column) in colInfoResult.columns.pairs:
          {name: (column[i], colInfoResult.originalPagesMap[name][i])}
      (el, colInfoResult.resColsPass[i], colInfoResult.resColsFail[i])

  let tabliteConfig = modules().tablite.modules.config.classes.Config
  var pageSize = tabliteConfig.PAGE_SIZE.to(int)
  var converted = newSeqOfCap[(seq[(string, nimpy.PyObject)], seq[(string, nimpy.PyObject)])](taskListInp.len)
  let pbarStep = 45 / max(taskListInp.len - 1, 1)

  for (columns, resPass, resFail) in taskListInp:
    converted.add(doSliceConvert(dir_pid, pageSize, columns, colInfoResult.rejectReasonName, resPass, resFail, colInfoResult.desiredColumnMap, colInfoResult.columnNames, colInfoResult.isCorrectType))

    discard pbar.update(pbarStep)

  proc extendTable(table: var nimpy.PyObject, columns: seq[(string, nimpy.PyObject)]): void {.inline.} =
    # Append each converted page to the matching column's page list.
    for (col_name, pg) in columns:
      let col = table[col_name]

      discard col.pages.append(pg) # can't col.extend because nim is dumb :)

  for (pg_pass, pg_fail) in converted:
    tblPass.extendTable(pg_pass)
    tblFail.extendTable(pg_fail)

  finishProgress()

  return (tblPass, tblFail)

proc newColumnSelectorInfo(column: string, `type`: string, allow_empty: bool, rename: opt.Option[string]): nimpy.PyObject =
  ## Builds the Python dict describing one column selection:
  ## keys `column`, `type`, `allow_empty` and `rename`.
  ## `rename` is stored as nil when no new name was supplied.
  result = modules().builtins.classes.DictClass!(
    column: column,
    type: `type`,
    allow_empty: allow_empty
  )

  if rename.isSome():
    result["rename"] = rename.get()
  else:
    result["rename"] = nil

# Demo setup: read tablite's configured working directory and derive the
# page directory `<workdir>/nim/pages` for this run.
let tabliteConfig = modules().tablite.modules.config.classes.Config
let workdir = Path(modules().toStr(tabliteConfig.workdir))
# Fixed string used in place of a process id for this Nim-only run.
let pid = "nim"
let pagedir = workdir / Path(pid) / Path("pages")

# Make sure the page directory exists before anything is run.
createDir(string pagedir)

# Store the same pid on tablite's Config
# (presumably so tablite resolves the same directory; verify against tablite).
tabliteConfig.pid = pid
# tabliteConfig.pageSize = 2
# tabliteConfig.MULTIPROCESSING_MODE = tabliteConfig.FALSE

# let columns = modules().builtins.classes.DictClass!({"A ": @[nimValueToPy(0), nimValueToPy(nil), nimValueToPy(10), nimValueToPy(200)]}.toTable)
# let columns = modules().builtins.classes.DictClass!({"A ": @[1, 22, 333]}.toTable)
# let columns = modules().builtins.classes.DictClass!({"A": @[0, 1]}.toTable)
# let columns = modules().builtins.classes.DictClass!({"A ": @["1", "22", "333", "", "abc"]}.toTable)
# let columns = modules().builtins.classes.DictClass!({"A": @["a", "1", "c"], "B": @["d", "e", "f"]}.toTable)
# let columns = modules().builtins.classes.DictClass!({"A ": @[nimValueToPy("1"), nimValueToPy("222"), nimValueToPy("333"), nimValueToPy(nil), nimValueToPy("abc")]}.toTable)
# let columns = modules().builtins.classes.DictClass!({"A ": @[nimValueToPy(1), nimValueToPy(2.0), nimValueToPy("333"), nimValueToPy("abc")]}.toTable)
# let columns = modules().builtins.classes.DictClass!({"A": @[nimValueToPy(111111), nimValueToPy(222222), nimValueToPy(333333)], "B": @[nimValueToPy(0), nimValueToPy(nil), nimValueToPy(2)]}.toTable)
# let columns = modules().builtins.classes.DictClass!({"A": @[nimValueToPy("0"), nimValueToPy(nil), nimValueToPy("2")], "B": @[nimValueToPy("3"), nimValueToPy(nil), nimValueToPy("4")]}.toTable)
# let columns = modules().builtins.classes.DictClass!({"str": @["1", "0"]})
# let columns = modules().builtins.classes.DictClass!({"float": @[1.0, 0.0]})
# Active demo input: a single column named "date" holding two Python date objects.
let columns = modules().builtins.classes.DictClass!({"date": @[
modules().datetime.classes.DateClass!(2000, 1, 1),
modules().datetime.classes.DateClass!(2000, 1, 2),
]})
# let columns = pymodules.builtins().dict({"str": @[nimValueToPy("abc"), nimValueToPy("efg"), nimValueToPy(nil)]}.toTable)
# Wrap the columns in a tablite Table.
let table = modules().tablite.classes.TableClass!(columns = columns)
# Data directory from the DATA_DIR environment variable, defaulting to ".";
# only referenced by the commented-out `fromFile` alternative that follows.
let dirdata = os.getEnv("DATA_DIR", ".")
# let table = modules().tablite.fromFile(dirdata & "/gesaber_data_10k.csv")
# let table = modules().tablite.classes.TableClass.load("/media/ratchet/hdd/tablite/filter_0_false.tpz")

# discard table.show(dtype = true)

# Column selection requests passed to `columnSelect`, as a Python list of dicts.
# Only the two "date" entries (to "time" and to "datetime") are active; the
# commented-out entries are alternative scenarios kept for manual testing and
# must be paired with a matching `columns`/`table` definition.
let select_cols = modules().builtins.classes.ListClass!(@[
# newColumnSelectorInfo("A ", "int", true, opt.none[string]()),
# newColumnSelectorInfo("A ", "float", true, opt.none[string]()),
# newColumnSelectorInfo("A ", "float", false, opt.none[string]()),
# newColumnSelectorInfo("A ", "bool", false, opt.none[string]()),
# newColumnSelectorInfo("A ", "str", false, opt.none[string]()),
# newColumnSelectorInfo("A ", "date", false, opt.none[string]()),
# newColumnSelectorInfo("A ", "datetime", false, opt.none[string]()),
# newColumnSelectorInfo("A ", "time", false, opt.none[string]()),
# newColumnSelectorInfo("A", "int", true, opt.none[string]()),
# newColumnSelectorInfo("B", "str", true, opt.none[string]()),

# newColumnSelectorInfo("str", "bool", false, opt.some("bool")),
# newColumnSelectorInfo("str", "int", false, opt.some("int")),
# newColumnSelectorInfo("str", "float", false, opt.some("float")),
# newColumnSelectorInfo("str", "str", false, opt.some("str")),

# newColumnSelectorInfo("float", "bool", false, opt.some("bool")),
# newColumnSelectorInfo("float", "int", false, opt.some("int")),
# newColumnSelectorInfo("float", "float", false, opt.some("float")),
# newColumnSelectorInfo("float", "str", false, opt.some("str")),
# newColumnSelectorInfo("float", "date", false, opt.some("date")),
# newColumnSelectorInfo("float", "time", false, opt.some("time")),
# newColumnSelectorInfo("float", "datetime", false, opt.some("datetime")),

# newColumnSelectorInfo("date", "bool", false, opt.some("bool")),
# newColumnSelectorInfo("date", "int", false, opt.some("int")),
# newColumnSelectorInfo("date", "float", false, opt.some("float")),
# newColumnSelectorInfo("date", "str", false, opt.some("str")),
# newColumnSelectorInfo("date", "date", false, opt.some("date")),
# newColumnSelectorInfo("date", "time", false, opt.some("time")),
# newColumnSelectorInfo("date", "datetime", false, opt.some("datetime")),

# newColumnSelectorInfo("str", "str", true, opt.some("str")),

# newColumnSelectorInfo("A", "str", false, opt.none[string]()),
# newColumnSelectorInfo("B", "int", false, opt.none[string]()),

# newColumnSelectorInfo("A", "int", false, opt.none[string]()),

# newColumnSelectorInfo("date", "bool", false, opt.some("bool")),
# newColumnSelectorInfo("date", "int", false, opt.some("int")),
# newColumnSelectorInfo("date", "float", false, opt.some("float")),
# newColumnSelectorInfo("date", "str", false, opt.some("str")),
# newColumnSelectorInfo("date", "date", false, opt.some("date")),
newColumnSelectorInfo("date", "time", false, opt.some("time")),
newColumnSelectorInfo("date", "datetime", false, opt.some("datetime")),

# newColumnSelectorInfo("sale_date", "datetime", false, opt.none[string]()),
# newColumnSelectorInfo("cust_nbr", "str", false, opt.none[string]()),
# newColumnSelectorInfo("Order_Number", "str", false, opt.none[string]()),
# newColumnSelectorInfo("prod_slbl", "str", false, opt.none[string]()),
# newColumnSelectorInfo("cases", "int", false, opt.none[string]()),

# newColumnSelectorInfo("Article code", "str", false, opt.none[string]()),
# newColumnSelectorInfo("Article Description", "str", false, opt.none[string]()),
# newColumnSelectorInfo("Department", "str", false, opt.none[string]()),
# newColumnSelectorInfo("Department Name", "str", false, opt.none[string]()),
# newColumnSelectorInfo("MC", "str", false, opt.none[string]()),
# newColumnSelectorInfo("MC Name", "str", false, opt.none[string]()),
# newColumnSelectorInfo("Season", "str", false, opt.none[string]()),
# newColumnSelectorInfo("Season Name", "str", false, opt.none[string]()),
# newColumnSelectorInfo("Source", "int", false, opt.none[string]()),
# newColumnSelectorInfo("Source Name", "str", false, opt.none[string]()),
# newColumnSelectorInfo("X-site artl status", "int", false, opt.none[string]()),
# newColumnSelectorInfo("X-site artl status desc", "str", false, opt.none[string]()),
# newColumnSelectorInfo("Fragile?", "str", false, opt.none[string]()),
# newColumnSelectorInfo("Inner Type (Current)", "str", false, opt.none[string]()),
# newColumnSelectorInfo("Inner Name (Current)", "str", false, opt.none[string]()),
# newColumnSelectorInfo("Inner Type (STO)", "str", false, opt.none[string]()),
# newColumnSelectorInfo("Units per Case (Current)", "float", false, opt.none[string]()),
# newColumnSelectorInfo("Units per Case (STO)", "float", false, opt.none[string]()),
# newColumnSelectorInfo("Units per pallet\\nLATEST", "float", false, opt.some("Units per pallet")),
# newColumnSelectorInfo("Case L (m)", "float", false, opt.none[string]()),
# newColumnSelectorInfo("Case W (m)", "float", false, opt.none[string]()),
# newColumnSelectorInfo("Case H (m)", "float", false, opt.none[string]()),
# newColumnSelectorInfo("Case Vol (m3)", "float", false, opt.none[string]()),
# newColumnSelectorInfo("Case Gross Weight (KG)", "float", false, opt.none[string]()),
# newColumnSelectorInfo("Inner L (m)", "float", false, opt.none[string]()),
# newColumnSelectorInfo("Inner Weight", "float", false, opt.none[string]()),
# newColumnSelectorInfo("STOs", "str", false, opt.none[string]()),
# newColumnSelectorInfo("Sum of STO Qty", "str", false, opt.none[string]()),
# newColumnSelectorInfo("Pallet Ti", "int", false, opt.none[string]()),
# newColumnSelectorInfo("Pallet Hi", "int", false, opt.none[string]()),
# newColumnSelectorInfo("LAY", "str", false, opt.none[string]()),
])



# Run the Nim-only column selection on the demo table, using Python's tqdm
# for progress and `<workdir>/<pid>` as the working directory.
# `Taskmanager` binds to the `TaskManager` parameter: Nim compares identifiers
# case-insensitively after the first character.
let (select_pass, select_fail) = table.columnSelect(
select_cols,
nimpy.pyImport("tqdm").tqdm,
dir_pid = workdir / Path(pid),
Taskmanager = modules().mplite.classes.TaskManager
)

# Show both resulting tables (with dtypes) for manual inspection.
discard select_pass.show(dtype = true)
discard select_fail.show(dtype = true)
74 changes: 23 additions & 51 deletions nimlite/funcs/filter.nim
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,18 @@ proc filter*(table: nimpy.PyObject, pyExpressions: seq[nimpy.PyObject], filterTy
template dumpPage(columns: seq[string], passColumn: nimpy.PyObject, failColumn: nimpy.PyObject): void =
var firstPage = 0
var currentOffset = 0
var maskOffset = 0

template dumpSlice(slice: seq[int], originalPage: BaseNDArray, column: nimpy.PyObject): void =
  # Write the rows of `originalPage` selected by `slice` as a new page and
  # append that page to `column`; an empty selection produces no page at all.
  if slice.len > 0:
    let pageId = base.classes.SimplePageClass.next_id(string basedir).to(string)
    let slicedPage = originalPage[slice]

    # The page is saved as "<pagedir>/<id>.npy" before being registered.
    slicedPage.save(string(pagedir / Path(pageId & ".npy")))

    discard column.pages.append(newPyPage(slicedPage, string basedir, pageId))

while true:
var len = getPageLen(columns[firstPage])
Expand All @@ -225,17 +237,15 @@ proc filter*(table: nimpy.PyObject, pyExpressions: seq[nimpy.PyObject], filterTy
inc firstPage
currentOffset = currentOffset + len

var maskOffset = 0

let indiceOffset = offset - currentOffset
var indiceOffset = offset - currentOffset
var maskLeftOver = bitNum - maskOffset

while maskOffset < bitNum:
while maskLeftOver > 0:
let page = readNumpy(columns[firstPage])

let len = page.len
let sliceMax = min((bitNum - maskOffset), len)
let sliceLen = sliceMax - maskOffset
let slice = maskOffset..<sliceMax
let len = (page.len - indiceOffset)
let sliceLen = min(maskLeftOver, len)
let slice = maskOffset..<(sliceLen + maskOffset)

var validIndices = newSeqOfCap[int](sliceLen - (sliceLen shr 2))
var invalidIndices = newSeqOfCap[int](sliceLen shr 2)
Expand All @@ -244,25 +254,12 @@ proc filter*(table: nimpy.PyObject, pyExpressions: seq[nimpy.PyObject], filterTy
if m: validIndices.add(i + indiceOffset)
else: invalidIndices.add(i + indiceOffset)

let passPid = base.classes.SimplePageClass.next_id(string basedir).to(string)
let failPid = base.classes.SimplePageClass.next_id(string basedir).to(string)

let passPath = string(pagedir / Path(passPid & ".npy"))
let failPath = string(pagedir / Path(failPid & ".npy"))

let passPage = page[validIndices]
let failPage = page[invalidIndices]

passPage.save(passPath)
failPage.save(failPath)

let passPagePy = newPyPage(passPage, string basedir, passPid)
let failPagePy = newPyPage(failPage, string basedir, failPid)

discard passColumn.pages.append(passPagePy)
discard failColumn.pages.append(failPagePy)
validIndices.dumpSlice(page, passColumn)
invalidIndices.dumpSlice(page, failColumn)

maskOffset = maskOffset + sliceLen
maskLeftOver = maskLeftOver - sliceLen
indiceOffset = 0
inc firstPage

template dumpPages(tablePages: Table[string, seq[string]]): void =
Expand All @@ -272,7 +269,7 @@ proc filter*(table: nimpy.PyObject, pyExpressions: seq[nimpy.PyObject], filterTy
let tableLen = builtins.getLen(table)
let tqdmLen = int ceil(float(tableLen) / float(pageSize))
let TqdmClass = (if isNone(tqdm): m.tqdm.classes.TqdmClass else: tqdm)
let pbar = TqdmClass!(total: tqdmLen, desc="filter")
let pbar = TqdmClass!(total: tqdmLen, desc = "filter")

for (i, row) in enumerate(exprCols.iterateRows(tablePages)):
bitmask[bitNum] = row.checkExpressions(exprCols, expressions, filterType)
Expand Down Expand Up @@ -303,28 +300,3 @@ proc filter*(table: nimpy.PyObject, pyExpressions: seq[nimpy.PyObject], filterTy
discard pbar.close()

return (passTable, failTable)


# Manual smoke test for `filter`; compiled only when the module is not built
# as a library.
when appType != "lib":
let m = modules()
let Config = m.tablite.modules.config.classes.Config

# Config.PAGE_SIZE = 2

# Three integer columns with four rows each.
let table = m.tablite.classes.TableClass!({
"a": @[1, 2, 3, 4],
"b": @[10, 20, 30, 40],
"c": @[4, 4, 4, 4]
}.toTable)
# Expressions: `a >= 2` against a literal, and `a == c` column-to-column.
let pyExpressions = @[
m.builtins.classes.DictClass!(column1: "a", criteria: ">=", value2: 2),
# m.builtins.classes.DictClass!(column1: "b", criteria: "==", value2: 20),
m.builtins.classes.DictClass!(column1: "a", criteria: "==", column2: "c"),
]

# Page size is set after the table is built and before `filter` runs
# (presumably so the input pages and the filter page size differ; TODO confirm).
Config.PAGE_SIZE = 2

# Filter type "all" with nil passed as the tqdm argument.
let (tblPass, tblFail) = filter(table, pyExpressions, "all", nil)

discard tblPass.show()
discard tblFail.show()
Loading
Loading