Merge pull request #158 from omenSi/groupby

Groupby ported to nim
root-11 · Apr 5, 2024 · d5cd0de · d5cd0de
2 parents ab32d28 + f11cc68
commit d5cd0de
Show file tree

Hide file tree

Showing 12 changed files with 957 additions and 541 deletions.
diff --git a/nimlite/funcs/groupby.nim b/nimlite/funcs/groupby.nim
diff --git a/nimlite/funcs/imputation.nim b/nimlite/funcs/imputation.nim
@@ -25,14 +25,14 @@ proc uniqueColumnValues(pagePaths: seq[string]): seq[PY_ObjectND] =
             uniqueVals.add(v)
     result = uniqueVals
 
-method toFloat(self: PY_ObjectND): float {.base, inline.} = implement("PY_ObjectND.`toFloat` must be implemented by inheriting class: " & $self.kind)
-method toFloat(self: PY_NoneType): float = -Inf
-method toFloat(self: PY_Boolean): float = float(self.value)
-method toFloat(self: PY_Int): float = float(self.value)
-method toFloat(self: PY_Float): float = self.value
-method toFloat(self: PY_Date): float = self.value.toTime().toUnixFloat()
-method toFloat(self: PY_Time): float = self.value.duration2Seconds()
-method toFloat(self: PY_DateTime): float = self.value.toTime().toUnixFloat()
+method toFloat*(self: PY_ObjectND): float {.base, inline.} = implement("PY_ObjectND.`toFloat` must be implemented by inheriting class: " & $self.kind) # base case, now exported via `*`; the overloads below handle the concrete kinds
+method toFloat*(self: PY_NoneType): float = -Inf # None is represented as negative infinity
+method toFloat*(self: PY_Boolean): float = float(self.value) # boolean value converted with float()
+method toFloat*(self: PY_Int): float = float(self.value) # integer value converted with float()
+method toFloat*(self: PY_Float): float = self.value # already a float, returned unchanged
+method toFloat*(self: PY_Date): float = self.value.toTime().toUnixFloat() # date -> Unix timestamp as float
+method toFloat*(self: PY_Time): float = self.value.duration2Seconds() # time -> seconds, via duration2Seconds (helper not shown in this hunk)
+method toFloat*(self: PY_DateTime): float = self.value.toTime().toUnixFloat() # datetime -> Unix timestamp as float
 
 proc cmpNonText(this, other: PY_ObjectND): int =
     let r = system.cmp[int](PARITY_TABLE[this.kind], PARITY_TABLE[other.kind])

diff --git a/nimlite/libnimlite.nim b/nimlite/libnimlite.nim
@@ -152,3 +152,12 @@ when isLib:
                 raise newException(ValueError, "unrecognized type.")
         return nearestNeighbourImputation(T, sources, miss, targets, tqdm)
     # --------  IMPUTATION  -----------
+
+    # --------   GROUPBY  -----------
+    import funcs/groupby as gb
+    proc groupby(T: nimpy.PyObject, keys: seq[string], functions: seq[(string, string)], tqdm: nimpy.PyObject): nimpy.PyObject {. exportpy .} = # Python-facing wrapper (exportpy); converts `functions` and delegates to gb.groupby
+        var funcs = collect: # builds a seq of (cn, str2Accumulator(fn)) tuples
+            for (cn, fn) in functions: # presumably cn = column name, fn = aggregate function name -- confirm against callers
+                (cn, str2Accumulator(fn)) # str2Accumulator is defined outside this hunk
+        return gb.groupby(T, keys, funcs, tqdm) # T, keys and tqdm are forwarded unchanged
+    # --------   GROUPBY  -----------
diff --git a/nimlite/libnimlite.pyi b/nimlite/libnimlite.pyi
@@ -33,6 +33,8 @@ def collect_text_reader_page_info_task(task_info, task):
 def nearest_neighbour(T, sources, missing, targets, tqdm):
     pass
 
+def groupby(T, keys, functions, tqdm):  # stub with the same parameters as the exportpy `groupby` proc in libnimlite.nim
+    pass
 
 def filter(table, expressions, type, tqdm):
     pass
diff --git a/nimlite/numpy.nim b/nimlite/numpy.nim
@@ -1,4 +1,4 @@
-import std/[os, unicode, strutils, sugar, times, tables, enumerate, sequtils, paths, hashes]
+import std/[os, unicode, strutils, sugar, times, tables, enumerate, sequtils, paths, hashes, strformat]
 from std/macros import bindSym
 from std/typetraits import name
 from std/math import ceil
@@ -1429,6 +1429,22 @@ method toHash(self: PY_String): Hash = hash((self.kind, self.value))
 proc hash*(self: PY_ObjectND): Hash = self.toHash()
 proc hash*(self: seq[PY_ObjectND]): Hash = hash(self, 0, self.high)
 
+proc slice*(table: nimpy.PyObject, columnNames: openArray[string]): nimpy.PyObject = # returns a new table object containing only the columns named in columnNames
+    let
+        m = modules()
+        tabliteBase = m.tablite.modules.base
+        tabliteConf = m.tablite.modules.config.classes.Config
+        pid: string = tabliteConf.pid.to(string)
+        workDir: string = m.toStr(tabliteConf.workdir)
+        pidDir: string = &"{workDir}/{pid}" # "<workdir>/<pid>"; handed to every ColumnClass created below
+
+    var t = m.tablite.classes.TableClass!() # new table instance, constructed with no arguments
+    for name in columnNames:
+        var c = tabliteBase.classes.ColumnClass!(pidDir) # one new column object per requested name
+        for p in table[name].pages:
+            discard c.pages.append(p) # page objects are appended as-is (presumably shared with the source table -- confirm); append's return value is ignored
+        t[name] = c
+    return t
 
 proc index*(table: nimpy.PyObject, columnNames: openArray[string]): TableIndices =
     var d = initOrderedTable[seq[PY_ObjectND], seq[int]]()

diff --git a/tablite/core.py b/tablite/core.py
@@ -17,12 +17,12 @@
 from tablite import lookup
 from tablite import match
 from tablite import sortation
-from tablite import groupbys
 from tablite import pivots
 from tablite import imputation
 from tablite import diff
 from tablite.config import Config
 from tablite.nimlite import column_select as _column_select, ColumnSelectorDict, ValidSkipEmpty
+from tablite.nimlite import groupby as _groupby
 from mplite import TaskManager as _TaskManager
 
 logging.getLogger("lml").propagate = False
@@ -611,7 +611,7 @@ def groupby(self, keys, functions, tqdm=_tqdm, pbar=None):
             https://github.com/root-11/tablite/blob/master/tests/test_groupby.py
 
         """
-        return groupbys.groupby(self, keys, functions, tqdm=tqdm, pbar=pbar)
+        return _groupby(self, keys, functions, tqdm)
 
     def pivot(self, rows, columns, functions, values_as_rows=True, tqdm=_tqdm, pbar=None):
         """

diff --git a/tablite/groupby_utils.py b/tablite/groupby_utils.py
@@ -1,201 +1,13 @@
-from collections import defaultdict
-from datetime import date, time, datetime, timedelta  # noqa
-
-
-class GroupbyFunction(object):
-    pass
-
-
-class Limit(GroupbyFunction):
-    def __init__(self):
-        self.value = None
-        self.f = None
-
-    def update(self, value):
-        if value is None:
-            pass
-        elif self.value is None:
-            self.value = value
-        else:
-            self.value = self.f((value, self.value))
-
-
-class Max(Limit):
-    def __init__(self):
-        super().__init__()
-        self.f = max
-
-
-class Min(Limit):
-    def __init__(self):
-        super().__init__()
-        self.f = min
-
-
-class Sum(GroupbyFunction):
-    def __init__(self):
-        self.value = 0
-
-    def update(self, value):
-        if isinstance(value, (type(None), date, time, datetime, str)):
-            raise ValueError(f"Sum of {type(value)} doesn't make sense.")
-        self.value += value
-
-
-class Product(GroupbyFunction):
-    def __init__(self) -> None:
-        self.value = 1
-
-    def update(self, value):
-        self.value *= value
-
-
-class First(GroupbyFunction):
-    empty = (None,)
-    # we will never receive a tuple, so using (None,) as the initial
-    # value will assure that IF None is the first value, then it can
-    # be captured correctly.
-
-    def __init__(self):
-        self.value = self.empty
-
-    def update(self, value):
-        if self.value is First.empty:
-            self.value = value
-
-
-class Last(GroupbyFunction):
-    def __init__(self):
-        self.value = None
-
-    def update(self, value):
-        self.value = value
-
-
-class Count(GroupbyFunction):
-    def __init__(self):
-        self.value = 0
-
-    def update(self, value):
-        self.value += 1
-
-
-class CountUnique(GroupbyFunction):
-    def __init__(self):
-        self.items = set()
-        self.value = None
-
-    def update(self, value):
-        self.items.add(value)
-        self.value = len(self.items)
-
-
-class Average(GroupbyFunction):
-    def __init__(self):
-        self.sum = 0
-        self.count = 0
-        self.value = 0
-
-    def update(self, value):
-        if isinstance(value, (date, time, datetime, str)):
-            raise ValueError(f"Sum of {type(value)} doesn't make sense.")
-        if value is not None:
-            self.sum += value
-            self.count += 1
-            self.value = self.sum / self.count
-
-
-class StandardDeviation(GroupbyFunction):
-    """
-    Uses J.P. Welfords (1962) algorithm.
-    For details see https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Online_algorithm
-    """
-
-    def __init__(self):
-        self.count = 0
-        self.mean = 0
-        self.c = 0.0
-
-    def update(self, value):
-        if isinstance(value, (date, time, datetime, str)):
-            raise ValueError(f"Std.dev. of {type(value)} doesn't make sense.")
-        if value is not None:
-            self.count += 1
-            dt = value - self.mean
-            self.mean += dt / self.count
-            self.c += dt * (value - self.mean)
-
-    @property
-    def value(self):
-        if self.count <= 1:
-            return 0.0
-        variance = self.c / (self.count - 1)
-        return variance ** (1 / 2)
-
-
-class Histogram(GroupbyFunction):
-    def __init__(self):
-        self.hist = defaultdict(int)
-
-    def update(self, value):
-        self.hist[value] += 1
-
-
-class Median(Histogram):
-    def __init__(self):
-        super().__init__()
-
-    @property
-    def value(self):
-        if not self.hist:
-            raise ValueError("No data.")
-
-        keys = len(self.hist.keys())
-        if keys == 1:
-            for k in self.hist:
-                return k
-        elif keys % 2 == 0:
-            A, B, total, midpoint = None, None, 0, sum(self.hist.values()) / 2
-            for k, v in sorted(self.hist.items()):
-                total += v
-                A, B = B, k
-                if total > midpoint:
-                    return (A + B) / 2
-        else:
-            midpoint = sum(self.hist.values()) / 2
-            total = 0
-            for k, v in sorted(self.hist.items()):
-                total += v
-                if total > midpoint:
-                    return k
-
-
-class Mode(Histogram):
-    def __init__(self):
-        super().__init__()
-
-    @property
-    def value(self):
-        L = [(v, k) for k, v in self.hist.items()]
-        L.sort(reverse=True)
-        _, most_frequent = L[0]  # top of the list.
-        return most_frequent
-
-
 class GroupBy(object):
-    max = Max  # shortcuts to avoid having to type a long list of imports.
-    min = Min
-    sum = Sum
-    product = Product
-    first = First
-    last = Last
-    count = Count
-    count_unique = CountUnique
-    avg = Average
-    stdev = StandardDeviation
-    median = Median
-    mode = Mode
-
-    functions = [Max, Min, Sum, First, Last, Product, Count, CountUnique, Average, StandardDeviation, Median, Mode]
-
-    function_names = {f.__name__: f for f in functions}
+    max = "max"  # every shortcut below is now a plain string naming the aggregate, not a class
+    min = "min"
+    sum = "sum"
+    product = "product"
+    first = "first"
+    last = "last"
+    count = "count"
+    count_unique = "count_unique"
+    avg = "avg"
+    stdev = "stdev"
+    median = "median"
+    mode = "mode"  # NOTE(review): the removed `functions` / `function_names` attributes have no replacement in this hunk