Skip to content

Commit

Permalink
Merge pull request #158 from omenSi/groupby
Browse files Browse the repository at this point in the history
Groupby ported to nim
  • Loading branch information
omenSi authored Apr 5, 2024
2 parents ab32d28 + f11cc68 commit d5cd0de
Show file tree
Hide file tree
Showing 12 changed files with 957 additions and 541 deletions.
754 changes: 754 additions & 0 deletions nimlite/funcs/groupby.nim

Large diffs are not rendered by default.

16 changes: 8 additions & 8 deletions nimlite/funcs/imputation.nim
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,14 @@ proc uniqueColumnValues(pagePaths: seq[string]): seq[PY_ObjectND] =
uniqueVals.add(v)
result = uniqueVals

# `toFloat` maps a PY_ObjectND value onto a float; being a `method`, the
# overload is selected from the runtime type of `self`.
# Base case: hands a "must be implemented" message (including the object's
# kind) to `implement` (presumably signals a missing override; confirm
# against `implement`'s definition).
method toFloat(self: PY_ObjectND): float {.base, inline.} = implement("PY_ObjectND.`toFloat` must be implemented by inheriting class: " & $self.kind)
# None is mapped to negative infinity.
method toFloat(self: PY_NoneType): float = -Inf
# Booleans and integers are converted with a plain `float(...)` conversion.
method toFloat(self: PY_Boolean): float = float(self.value)
method toFloat(self: PY_Int): float = float(self.value)
# Floats are returned unchanged.
method toFloat(self: PY_Float): float = self.value
# Dates go through `toTime()` and are returned as `toUnixFloat()`.
method toFloat(self: PY_Date): float = self.value.toTime().toUnixFloat()
# Times are converted by `duration2Seconds` (presumably seconds as a float; confirm).
method toFloat(self: PY_Time): float = self.value.duration2Seconds()
# Datetimes use the same `toTime().toUnixFloat()` conversion as dates.
method toFloat(self: PY_DateTime): float = self.value.toTime().toUnixFloat()
# Exported `toFloat` family: maps a PY_ObjectND value onto a float; being a
# `method`, the overload is selected from the runtime type of `self`.
# Base case: hands a "must be implemented" message (including the object's
# kind) to `implement` (presumably signals a missing override; confirm
# against `implement`'s definition).
method toFloat*(self: PY_ObjectND): float {.base, inline.} = implement("PY_ObjectND.`toFloat` must be implemented by inheriting class: " & $self.kind)
# None is mapped to negative infinity.
method toFloat*(self: PY_NoneType): float = -Inf
# Booleans and integers are converted with a plain `float(...)` conversion.
method toFloat*(self: PY_Boolean): float = float(self.value)
method toFloat*(self: PY_Int): float = float(self.value)
# Floats are returned unchanged.
method toFloat*(self: PY_Float): float = self.value
# Dates go through `toTime()` and are returned as `toUnixFloat()`.
method toFloat*(self: PY_Date): float = self.value.toTime().toUnixFloat()
# Times are converted by `duration2Seconds` (presumably seconds as a float; confirm).
method toFloat*(self: PY_Time): float = self.value.duration2Seconds()
# Datetimes use the same `toTime().toUnixFloat()` conversion as dates.
method toFloat*(self: PY_DateTime): float = self.value.toTime().toUnixFloat()

proc cmpNonText(this, other: PY_ObjectND): int =
let r = system.cmp[int](PARITY_TABLE[this.kind], PARITY_TABLE[other.kind])
Expand Down
9 changes: 9 additions & 0 deletions nimlite/libnimlite.nim
Original file line number Diff line number Diff line change
Expand Up @@ -152,3 +152,12 @@ when isLib:
raise newException(ValueError, "unrecognized type.")
return nearestNeighbourImputation(T, sources, miss, targets, tqdm)
# -------- IMPUTATION -----------

# -------- GROUPBY -----------
import funcs/groupby as gb
proc groupby(T: nimpy.PyObject, keys: seq[string], functions: seq[(string, string)], tqdm: nimpy.PyObject): nimpy.PyObject {. exportpy .} =
  ## Python-exported group-by entry point.
  ##
  ## Each `(string, string)` pair in `functions` is rebuilt with its second
  ## element translated by `str2Accumulator`; the first element is passed
  ## through untouched. The resulting sequence is forwarded, together with
  ## `T`, `keys` and `tqdm`, to `gb.groupby`, whose return value is returned.
  var accumulators = collect:
    for spec in functions:
      (spec[0], str2Accumulator(spec[1]))
  result = gb.groupby(T, keys, accumulators, tqdm)
# -------- GROUPBY -----------
2 changes: 2 additions & 0 deletions nimlite/libnimlite.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ def collect_text_reader_page_info_task(task_info, task):
# Signature stub for the `nearest_neighbour` entry of libnimlite; the body is
# a placeholder only.
# NOTE(review): presumably backed by the Nim imputation export that ends in
# `nearestNeighbourImputation(T, sources, miss, targets, tqdm)` - confirm.
def nearest_neighbour(T, sources, missing, targets, tqdm):
    pass

# Signature stub for the `groupby` entry of libnimlite; the body is a
# placeholder only.
# NOTE(review): presumably mirrors the Nim `groupby` export, where `keys` is a
# sequence of strings and `functions` a sequence of (string, string) pairs -
# confirm against libnimlite.nim.
def groupby(T, keys, functions, tqdm):
    pass

# Signature stub for the `filter` entry of libnimlite; the body is a
# placeholder only.
# NOTE(review): the implementation is not visible from this stub file -
# confirm parameter semantics against the Nim source.
def filter(table, expressions, type, tqdm):
    pass
18 changes: 17 additions & 1 deletion nimlite/numpy.nim
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import std/[os, unicode, strutils, sugar, times, tables, enumerate, sequtils, paths, hashes]
import std/[os, unicode, strutils, sugar, times, tables, enumerate, sequtils, paths, hashes, strformat]
from std/macros import bindSym
from std/typetraits import name
from std/math import ceil
Expand Down Expand Up @@ -1429,6 +1429,22 @@ method toHash(self: PY_String): Hash = hash((self.kind, self.value))
proc hash*(self: PY_ObjectND): Hash = self.toHash()
proc hash*(self: seq[PY_ObjectND]): Hash = hash(self, 0, self.high)

proc slice*(table: nimpy.PyObject, columnNames: openArray[string]): nimpy.PyObject =
  ## Creates a new table object containing only the columns listed in
  ## `columnNames`.
  ##
  ## For every requested name a fresh column object is constructed with the
  ## `<workdir>/<pid>` directory taken from the tablite config, and each page
  ## of `table[name]` is appended to it. No page is re-created: the new column
  ## receives the very page objects iterated from the source column.
  let
    pyMods = modules()
    baseModule = pyMods.tablite.modules.base
    config = pyMods.tablite.modules.config.classes.Config
    processId: string = config.pid.to(string)
    workingDir: string = pyMods.toStr(config.workdir)
    columnDir: string = &"{workingDir}/{processId}"

  result = pyMods.tablite.classes.TableClass!()
  for columnName in columnNames:
    var column = baseModule.classes.ColumnClass!(columnDir)
    for page in table[columnName].pages:
      # `append` returns a Python object that is of no interest here.
      discard column.pages.append(page)
    result[columnName] = column

proc index*(table: nimpy.PyObject, columnNames: openArray[string]): TableIndices =
var d = initOrderedTable[seq[PY_ObjectND], seq[int]]()
Expand Down
4 changes: 2 additions & 2 deletions tablite/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@
from tablite import lookup
from tablite import match
from tablite import sortation
from tablite import groupbys
from tablite import pivots
from tablite import imputation
from tablite import diff
from tablite.config import Config
from tablite.nimlite import column_select as _column_select, ColumnSelectorDict, ValidSkipEmpty
from tablite.nimlite import groupby as _groupby
from mplite import TaskManager as _TaskManager

logging.getLogger("lml").propagate = False
Expand Down Expand Up @@ -611,7 +611,7 @@ def groupby(self, keys, functions, tqdm=_tqdm, pbar=None):
https://github.com/root-11/tablite/blob/master/tests/test_groupby.py
"""
return groupbys.groupby(self, keys, functions, tqdm=tqdm, pbar=pbar)
return _groupby(self, keys, functions, tqdm)

def pivot(self, rows, columns, functions, values_as_rows=True, tqdm=_tqdm, pbar=None):
"""
Expand Down
212 changes: 12 additions & 200 deletions tablite/groupby_utils.py
Original file line number Diff line number Diff line change
@@ -1,201 +1,13 @@
from collections import defaultdict
from datetime import date, time, datetime, timedelta # noqa


class GroupbyFunction:
    """Common base type shared by every group-by aggregation class."""


class Limit(GroupbyFunction):
    """Running extreme of the non-None values seen so far.

    `f` is the reducer (it is called with a 2-tuple `(new, current)`);
    this class leaves it as None, so a usable reducer must be assigned
    before a second non-None value arrives.
    """

    def __init__(self):
        self.f = None
        self.value = None

    def update(self, value):
        """Fold `value` into the running result; None is ignored."""
        if value is None:
            return
        current = self.value
        # The first non-None value is taken as-is, later ones go through `f`.
        self.value = value if current is None else self.f((value, current))


class Max(Limit):
    """`Limit` whose reducer is the builtin `max`."""

    def __init__(self):
        super(Max, self).__init__()
        self.f = max


class Min(Limit):
    """`Limit` whose reducer is the builtin `min`."""

    def __init__(self):
        super(Min, self).__init__()
        self.f = min


class Sum(GroupbyFunction):
    """Running total, starting at 0."""

    def __init__(self):
        self.value = 0

    def update(self, value):
        """Add `value` to the total.

        Raises:
            ValueError: if `value` is None, a date, time, datetime or str.
        """
        rejected = (type(None), date, time, datetime, str)
        if not isinstance(value, rejected):
            self.value += value
            return
        raise ValueError(f"Sum of {type(value)} doesn't make sense.")


class Product(GroupbyFunction):
    """Running product, starting at 1."""

    def __init__(self) -> None:
        self.value = 1

    def update(self, value) -> None:
        """Multiply the running product by `value` (no type filtering is done)."""
        self.value *= value


class First(GroupbyFunction):
    """Remembers the first value passed to `update`, even when it is None."""

    # Sentinel for "nothing stored yet". A tuple is never received as a
    # value, so `(None,)` lets a genuine leading None be captured correctly.
    empty = (None,)

    def __init__(self):
        self.value = self.empty

    def update(self, value):
        """Store `value` only while the sentinel is still in place."""
        if self.value is not First.empty:
            return
        self.value = value


class Last(GroupbyFunction):
    """Keeps the most recent value passed to `update` (None before any update)."""

    def __init__(self) -> None:
        self.value = None

    def update(self, value) -> None:
        """Overwrite the stored value with `value`."""
        self.value = value


class Count(GroupbyFunction):
    """Counts how many times `update` was called, whatever the value."""

    def __init__(self) -> None:
        self.value = 0

    def update(self, value) -> None:
        """Increment the counter; `value` itself is not inspected."""
        self.value += 1


class CountUnique(GroupbyFunction):
    """Counts distinct values; `value` stays None until the first update."""

    def __init__(self):
        self.value = None
        self.items = set()

    def update(self, value):
        """Record `value` and refresh the distinct count."""
        seen = self.items
        seen.add(value)
        self.value = len(seen)


class Average(GroupbyFunction):
    """Running arithmetic mean of the non-None values (0 before any value)."""

    def __init__(self):
        self.value = 0
        self.count = 0
        self.sum = 0

    def update(self, value):
        """Fold `value` into the mean; None is skipped.

        Raises:
            ValueError: if `value` is a date, time, datetime or str.
        """
        if isinstance(value, (date, time, datetime, str)):
            raise ValueError(f"Sum of {type(value)} doesn't make sense.")
        if value is None:
            return
        self.sum += value
        self.count += 1
        self.value = self.sum / self.count


class StandardDeviation(GroupbyFunction):
    """
    Sample standard deviation computed incrementally with Welford's (1962)
    online algorithm.
    For details see https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm
    """

    def __init__(self):
        self.c = 0.0  # running sum of squared deviations from the mean
        self.mean = 0
        self.count = 0

    def update(self, value):
        """Fold `value` into the running statistics; None is skipped.

        Raises:
            ValueError: if `value` is a date, time, datetime or str.
        """
        if isinstance(value, (date, time, datetime, str)):
            raise ValueError(f"Std.dev. of {type(value)} doesn't make sense.")
        if value is None:
            return
        self.count += 1
        delta = value - self.mean
        self.mean += delta / self.count
        # Second factor deliberately uses the mean AFTER it was updated.
        self.c += delta * (value - self.mean)

    @property
    def value(self):
        """Standard deviation with an (n - 1) divisor; 0.0 for fewer than two values."""
        if self.count <= 1:
            return 0.0
        return (self.c / (self.count - 1)) ** (1 / 2)


class Histogram(GroupbyFunction):
    """Tallies how often each value was passed to `update`."""

    def __init__(self):
        # Missing keys start at 0, so `update` needs no membership test.
        self.hist = defaultdict(int)

    def update(self, value):
        """Increase the tally for `value` by one."""
        self.hist[value] = self.hist[value] + 1


class Median(Histogram):
    """Median of the observed values, derived from the histogram tallies."""

    def __init__(self):
        super().__init__()

    @property
    def value(self):
        """Return the median of all observations.

        The odd/even decision is made on the TOTAL number of observations,
        not on the number of distinct values. Deciding on distinct values
        gives wrong answers for skewed tallies: {1: 3, 2: 1} must yield 1
        and {1: 1, 2: 1, 3: 2} must yield 2.5.

        Returns:
            The middle observation for an odd count. For an even count, the
            mean of the two middle observations, or that observation itself
            (unaveraged, type preserved) when both are the same value.

        Raises:
            ValueError: if no value has been recorded.
            TypeError: if the recorded values cannot be ordered against
                each other, or two different middle values cannot be averaged.
        """
        if not self.hist:
            raise ValueError("No data.")
        if len(self.hist) == 1:
            # Only one distinct value: return it untouched, no ordering needed.
            return next(iter(self.hist))

        total = sum(self.hist.values())
        # 0-based ranks of the middle observation(s) in sorted order.
        upper_rank = total // 2
        lower_rank = upper_rank if total % 2 else upper_rank - 1

        lower = upper = None
        lower_found = False
        cumulative = 0
        for key, count in sorted(self.hist.items()):
            cumulative += count
            # A key covers rank r as soon as the cumulative tally exceeds r.
            if not lower_found and cumulative > lower_rank:
                lower, lower_found = key, True
            if cumulative > upper_rank:
                upper = key
                break

        if lower == upper:
            return upper
        return (lower + upper) / 2


class Mode(Histogram):
    """Most frequent value recorded in the histogram."""

    def __init__(self):
        super().__init__()

    @property
    def value(self):
        """Return the value with the highest tally.

        Ties are resolved in favour of the largest value, because the
        (tally, value) pairs are ordered in descending order as a whole.
        """
        ranked = sorted(((tally, key) for key, tally in self.hist.items()), reverse=True)
        return ranked[0][1]  # top of the list.


class GroupBy(object):
    # Namespace of group-by function identifiers, reachable as GroupBy.<name>.
    # NOTE(review): this listing contains two sets of assignments to the same
    # attribute names (class aliases first, plain strings further down). It
    # looks like both sides of a change shown together - confirm which set is
    # live before relying on either.
    max = Max  # shortcuts to avoid having to type a long list of imports.
    min = Min
    sum = Sum
    product = Product
    first = First
    last = Last
    count = Count
    count_unique = CountUnique
    avg = Average
    stdev = StandardDeviation
    median = Median
    mode = Mode

    # Every aggregation class, in a fixed order.
    functions = [Max, Min, Sum, First, Last, Product, Count, CountUnique, Average, StandardDeviation, Median, Mode]

    # Lookup from class name (e.g. "Max") to the class itself.
    function_names = {f.__name__: f for f in functions}
    # Read top to bottom, the assignments below rebind the attribute names
    # above to strings; `functions` and `function_names` are not reassigned.
    max = "max"
    min = "min"
    sum = "sum"
    product = "product"
    first = "first"
    last = "last"
    count = "count"
    count_unique = "count_unique"
    avg = "avg"
    stdev = "stdev"
    median = "median"
    mode = "mode"
Loading

0 comments on commit d5cd0de

Please sign in to comment.