diff --git a/.gitignore b/.gitignore index 4b36ef7..5b8a455 100644 --- a/.gitignore +++ b/.gitignore @@ -53,6 +53,7 @@ coverage.xml *.cover .hypothesis/ /.pytest_cache/ +fuzz_failures/* # Translations *.mo diff --git a/docs/source/_templates/autosummary/accessor_method.rst b/docs/source/_templates/accessor_method.rst similarity index 100% rename from docs/source/_templates/autosummary/accessor_method.rst rename to docs/source/_templates/accessor_method.rst diff --git a/docs/source/_templates/autosummary/short_title.rst b/docs/source/_templates/autosummary/short_title.rst deleted file mode 100644 index 5536fa1..0000000 --- a/docs/source/_templates/autosummary/short_title.rst +++ /dev/null @@ -1,5 +0,0 @@ -{{ name | escape | underline}} - -.. currentmodule:: {{ module }} - -.. auto{{ objtype }}:: {{ objname }} diff --git a/docs/source/_templates/short_title.rst b/docs/source/_templates/short_title.rst new file mode 100644 index 0000000..79f63eb --- /dev/null +++ b/docs/source/_templates/short_title.rst @@ -0,0 +1,25 @@ +{% if (name == "__add__") %}\+ +{% elif name == "__sub__" %}\- +{% elif name == "__mul__" %}\* +{% elif name == "__truediv__" %}\/ +{% elif name == "__floordiv__" %}\/\/ +{% elif name == "__pow__" %}\*\* (pow) +{% elif name == "__mod__" %}\% +{% elif name == "__pos__" %}\+ (unary) +{% elif name == "__neg__" %}\- (unary) +{% elif name == "__lt__" %}\< +{% elif name == "__le__" %}\<\= +{% elif name == "__gt__" %}\> +{% elif name == "__ge__" %}\>\= +{% elif name == "__eq__" %}\=\= +{% elif name == "__ne__" %}\!\= +{% elif name == "__or__" %}\| +{% elif name == "__and__" %}\& +{% elif name == "__xor__" %}\^ +{% elif name == "__invert__" %}\~ +{% else %}{{ name }} +{% endif %}{{ underline }} + +.. currentmodule:: {{ module }} + +.. 
auto{{ objtype }}:: {{ objname }} diff --git a/docs/source/conf.py b/docs/source/conf.py index 308422f..c76a5c7 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -37,9 +37,8 @@ "sphinx.ext.autosummary", "sphinx.ext.autosectionlabel", "sphinx.ext.intersphinx", - # "sphinx.ext.viewcode", "sphinx.ext.napoleon", - "sphinx_autosummary_accessors" + "sphinx_autosummary_accessors", ] maximum_signature_line_length = 100 diff --git a/docs/source/reference/operators/aggregation.rst b/docs/source/reference/operators/aggregation.rst index dfd9e23..e8ad630 100644 --- a/docs/source/reference/operators/aggregation.rst +++ b/docs/source/reference/operators/aggregation.rst @@ -16,7 +16,7 @@ Aggregation functions take a ``partition_by`` and ``filter`` keyword argument. T .. autosummary:: :toctree: _generated/ :nosignatures: - :template: autosummary/short_title.rst + :template: short_title.rst count all diff --git a/docs/source/reference/operators/arithmetic.rst b/docs/source/reference/operators/arithmetic.rst index ade1eb9..99dd145 100644 --- a/docs/source/reference/operators/arithmetic.rst +++ b/docs/source/reference/operators/arithmetic.rst @@ -6,13 +6,13 @@ Arithmetic .. autosummary:: :toctree: _generated/ :nosignatures: - :template: autosummary/short_title.rst + :template: short_title.rst __add__ - __floordiv__ - __mod__ + __sub__ __mul__ + __truediv__ __neg__ __pos__ - __sub__ - __truediv__ + __floordiv__ + __mod__ diff --git a/docs/source/reference/operators/comparison.rst b/docs/source/reference/operators/comparison.rst index 405bf98..1b5273a 100644 --- a/docs/source/reference/operators/comparison.rst +++ b/docs/source/reference/operators/comparison.rst @@ -6,7 +6,7 @@ Comparison .. 
autosummary:: :toctree: _generated/ :nosignatures: - :template: autosummary/short_title.rst + :template: short_title.rst __eq__ __ge__ diff --git a/docs/source/reference/operators/conditional_logic.rst b/docs/source/reference/operators/conditional_logic.rst index 86dbdf2..78e2d7e 100644 --- a/docs/source/reference/operators/conditional_logic.rst +++ b/docs/source/reference/operators/conditional_logic.rst @@ -6,7 +6,7 @@ Conditional Logic .. autosummary:: :toctree: _generated/ - :template: autosummary/short_title.rst + :template: short_title.rst :nosignatures: when diff --git a/docs/source/reference/operators/datetime.rst b/docs/source/reference/operators/datetime.rst index ed5b6a2..78290ae 100644 --- a/docs/source/reference/operators/datetime.rst +++ b/docs/source/reference/operators/datetime.rst @@ -6,7 +6,7 @@ Datetime / Duration .. autosummary:: :toctree: _generated/ :nosignatures: - :template: autosummary/accessor_method.rst + :template: accessor_method.rst dt.day dt.day_of_week diff --git a/docs/source/reference/operators/horizontal_aggregation.rst b/docs/source/reference/operators/horizontal_aggregation.rst index 0ab6ead..64ce7bf 100644 --- a/docs/source/reference/operators/horizontal_aggregation.rst +++ b/docs/source/reference/operators/horizontal_aggregation.rst @@ -6,7 +6,7 @@ Horizontal Aggregation .. autosummary:: :toctree: _generated/ :nosignatures: - :template: autosummary/short_title.rst + :template: short_title.rst coalesce count diff --git a/docs/source/reference/operators/index.rst b/docs/source/reference/operators/index.rst index dba9af8..358169a 100644 --- a/docs/source/reference/operators/index.rst +++ b/docs/source/reference/operators/index.rst @@ -113,6 +113,8 @@ Global functions .. 
autosummary:: :nosignatures: + all + any coalesce count dense_rank @@ -121,4 +123,5 @@ Global functions min rank row_number + sum when diff --git a/docs/source/reference/operators/logical.rst b/docs/source/reference/operators/logical.rst index c13980b..5af6576 100644 --- a/docs/source/reference/operators/logical.rst +++ b/docs/source/reference/operators/logical.rst @@ -6,7 +6,7 @@ Logical .. autosummary:: :toctree: _generated/ :nosignatures: - :template: autosummary/short_title.rst + :template: short_title.rst __and__ __invert__ diff --git a/docs/source/reference/operators/numerical.rst b/docs/source/reference/operators/numerical.rst index ab1868a..8931bc1 100644 --- a/docs/source/reference/operators/numerical.rst +++ b/docs/source/reference/operators/numerical.rst @@ -6,7 +6,7 @@ Numerical .. autosummary:: :toctree: _generated/ :nosignatures: - :template: autosummary/short_title.rst + :template: short_title.rst __pow__ abs diff --git a/docs/source/reference/operators/sorting_markers.rst b/docs/source/reference/operators/sorting_markers.rst index 2de6cb1..120cbd9 100644 --- a/docs/source/reference/operators/sorting_markers.rst +++ b/docs/source/reference/operators/sorting_markers.rst @@ -6,7 +6,7 @@ Sorting Markers .. autosummary:: :toctree: _generated/ :nosignatures: - :template: autosummary/short_title.rst + :template: short_title.rst ascending descending diff --git a/docs/source/reference/operators/string.rst b/docs/source/reference/operators/string.rst index 52c6a13..40d3108 100644 --- a/docs/source/reference/operators/string.rst +++ b/docs/source/reference/operators/string.rst @@ -6,7 +6,7 @@ String .. 
autosummary:: :toctree: _generated/ :nosignatures: - :template: autosummary/accessor_method.rst + :template: accessor_method.rst str.contains str.ends_with diff --git a/docs/source/reference/operators/type_conversion.rst b/docs/source/reference/operators/type_conversion.rst index 0d18bfc..037d325 100644 --- a/docs/source/reference/operators/type_conversion.rst +++ b/docs/source/reference/operators/type_conversion.rst @@ -6,7 +6,7 @@ Type Conversion .. autosummary:: :toctree: _generated/ - :template: autosummary/short_title.rst + :template: short_title.rst :nosignatures: lit diff --git a/docs/source/reference/operators/window.rst b/docs/source/reference/operators/window.rst index ec2c46a..ee4d14e 100644 --- a/docs/source/reference/operators/window.rst +++ b/docs/source/reference/operators/window.rst @@ -6,7 +6,7 @@ Window .. autosummary:: :toctree: _generated/ :nosignatures: - :template: autosummary/short_title.rst + :template: short_title.rst shift @@ -14,7 +14,7 @@ Window .. autosummary:: :toctree: _generated/ :nosignatures: - :template: autosummary/short_title.rst + :template: short_title.rst dense_rank rank diff --git a/docs/source/reference/targets.rst b/docs/source/reference/targets.rst index bb2c603..38df3ff 100644 --- a/docs/source/reference/targets.rst +++ b/docs/source/reference/targets.rst @@ -6,7 +6,7 @@ Backends / Export Targets .. autosummary:: :toctree: _generated/ :nosignatures: - :template: autosummary/short_title.rst + :template: short_title.rst DuckDb Pandas diff --git a/docs/source/reference/types.rst b/docs/source/reference/types.rst index 97b69fc..1faefaf 100644 --- a/docs/source/reference/types.rst +++ b/docs/source/reference/types.rst @@ -6,7 +6,7 @@ Types .. 
autosummary:: :toctree: _generated/ :nosignatures: - :template: autosummary/short_title.rst + :template: short_title.rst Dtype Bool diff --git a/docs/source/reference/verbs.rst b/docs/source/reference/verbs.rst index 9368bf9..5a2c8b9 100644 --- a/docs/source/reference/verbs.rst +++ b/docs/source/reference/verbs.rst @@ -6,7 +6,7 @@ Verbs .. autosummary:: :toctree: _generated/ :nosignatures: - :template: autosummary/short_title.rst + :template: short_title.rst alias arrange diff --git a/fuzz.py b/fuzz.py new file mode 100644 index 0000000..fca6789 --- /dev/null +++ b/fuzz.py @@ -0,0 +1,144 @@ +# ruff: noqa: F405 + +from __future__ import annotations + +import random +import string +from functools import partial + +import numpy as np +import polars as pl +from polars.testing import assert_frame_equal + +import pydiverse.transform as pdt +from pydiverse.transform._internal.ops import ops +from pydiverse.transform._internal.ops.op import Ftype, Operator +from pydiverse.transform._internal.ops.ops.markers import Marker +from pydiverse.transform._internal.ops.signature import Signature +from pydiverse.transform._internal.tree.col_expr import ColFn +from pydiverse.transform._internal.tree.types import Tvar +from pydiverse.transform.common import * # noqa: F403 +from tests.util.backend import BACKEND_TABLES + +rng = np.random.default_rng() +letters = list(string.printable) + +ALL_TYPES = [pdt.Int(), pdt.Float(), pdt.Bool(), pdt.String()] +MEAN_HEIGHT = 3 + +RNG_FNS = { + pdt.Float(): rng.standard_normal, + pdt.Int(): partial(rng.integers, -(1 << 13), 1 << 13), + pdt.Bool(): partial(rng.integers, 0, 2, dtype=bool), # high bound is exclusive + pdt.String(): ( + lambda rows: np.array( + ["".join(random.choices(letters, k=rng.poisson(10))) for _ in range(rows)] + ) + ), +} + + +def gen_table(rows: int, types: dict[pdt.Dtype, int]) -> pl.DataFrame: + d = pl.DataFrame() + + for ty, fn in RNG_FNS.items(): + if ty in types: + d = d.with_columns( + **{ + f"{ty.__class__.__name__.lower()} #{i+1}": 
pl.lit(fn(rows)) + for i in range(types[ty]) + } + ) + + return d + + +ops_with_return_type: dict[pdt.Dtype, list[tuple[Operator, Signature]]] = { + ty: [] for ty in ALL_TYPES +} + +for op in ops.__dict__.values(): + if ( + not isinstance(op, Operator) + or op.ftype != Ftype.ELEMENT_WISE + or isinstance(op, Marker) + ): + continue + for sig in op.signatures: + if not all(t in (*ALL_TYPES, Tvar("T")) for t in (*sig.types, sig.return_type)): + continue + + if isinstance(sig.return_type, Tvar) or any( + isinstance(param, Tvar) for param in sig.types + ): + for ty in ALL_TYPES: + rtype = ty if isinstance(sig.return_type, Tvar) else sig.return_type + ops_with_return_type[rtype].append( + ( + op, + Signature( + *( + ty if isinstance(param, Tvar) else param + for param in sig.types + ), + return_type=rtype, + ), + ) + ) + else: + ops_with_return_type[sig.return_type].append((op, sig)) + + +def gen_expr( + dtype: pdt.Dtype, cols: dict[pdt.Dtype, list[str]], q: float = 0.0 +) -> pdt.ColExpr: + if dtype.const: + return RNG_FNS[dtype.without_const()](1).item() + + if q > 1: + # we always use C here so the expression does not have to be generated for each + # backend + return C[rng.choice(cols[dtype])] + + op, sig = rng.choice(ops_with_return_type[dtype]) + assert isinstance(op, Operator) + assert isinstance(sig, Signature) + + args = [] + for param in sig.types[: len(sig.types) - sig.is_vararg]: + args.append(gen_expr(param, cols, q + rng.exponential(1 / MEAN_HEIGHT))) + + if sig.is_vararg: + nargs = int(rng.normal(2.5, 1 / 1.5)) + for _ in range(nargs): + args.append( + gen_expr(sig.types[-1], cols, q + rng.exponential(1 / MEAN_HEIGHT)) + ) + + return ColFn(op, *args) + + +it = int(input("number of iterations: ")) +rows = int(input("number of rows: ")) +seed = int(input("seed: ")) + +rng = np.random.default_rng(seed) +NUM_COLS_PER_TYPE = 5 + +df = gen_table(rows, {dtype: NUM_COLS_PER_TYPE for dtype in ALL_TYPES}) + + +tables = {backend: fn(df, "t") for backend, fn in 
BACKEND_TABLES.items()} +cols = { + dtype: [col.name for col in tables["polars"] if col.dtype() <= dtype] + for dtype in ALL_TYPES +} + +for _ in range(it): + expr = gen_expr(rng.choice(ALL_TYPES), cols) + results = { + backend: table >> mutate(y=expr) >> select(C.y) >> export(Polars()) + for backend, table in tables.items() + } + for _backend, res in results.items(): + assert_frame_equal(results["polars"], res) diff --git a/src/pydiverse/transform/_internal/backend/mssql.py b/src/pydiverse/transform/_internal/backend/mssql.py index 8298868..3b79bfc 100644 --- a/src/pydiverse/transform/_internal/backend/mssql.py +++ b/src/pydiverse/transform/_internal/backend/mssql.py @@ -24,7 +24,7 @@ LiteralCol, Order, ) -from pydiverse.transform._internal.tree.types import Bool, Datetime, Dtype, String +from pydiverse.transform._internal.tree.types import Bool, Datetime, Dtype, Int, String class MsSqlImpl(SqlImpl): @@ -132,7 +132,6 @@ def convert_bool_bit(expr: ColExpr | Order, wants_bool_as_bit: bool) -> ColExpr elif isinstance(expr, ColFn): wants_args_bool_as_bit = expr.op not in ( - ops.bool_xor, ops.bool_and, ops.bool_or, ops.bool_invert, @@ -302,3 +301,14 @@ def _is_nan(x): @impl(ops.is_not_nan) def _is_not_nan(x): return True + + @impl(ops.pow) + def _pow(x, y): + return_type = sqa.Double() + if isinstance(x.type, sqa.Numeric) and isinstance(y.type, sqa.Numeric): + return_type = sqa.Numeric() + return sqa.func.POWER(x, y, type_=return_type) + + @impl(ops.pow, Int(), Int()) + def _pow_int(x, y): + return sqa.func.POWER(sqa.cast(x, type_=sqa.Double()), y) diff --git a/src/pydiverse/transform/_internal/backend/polars.py b/src/pydiverse/transform/_internal/backend/polars.py index fc24095..ed3b083 100644 --- a/src/pydiverse/transform/_internal/backend/polars.py +++ b/src/pydiverse/transform/_internal/backend/polars.py @@ -215,10 +215,8 @@ def compile_col_expr(expr: ColExpr, name_in_df: dict[UUID, str]) -> pl.Expr: elif isinstance(expr, Cast): if ( - expr.target_type <= Int() - or 
expr.target_type <= Float() - and expr.val.dtype() <= String() - ): + expr.target_type <= Int() or expr.target_type <= Float() + ) and expr.val.dtype() <= String(): expr.val = expr.val.str.strip() compiled = compile_col_expr(expr.val, name_in_df).cast( polars_type(expr.target_type) @@ -614,13 +612,10 @@ def _shift(x, n, fill_value=None): return x.shift(n, fill_value=fill_value) @impl(ops.is_in) - def _is_in(x, *values, _pdt_args): + def _is_in(x, *values): if len(values) == 0: return pl.lit(False) - return pl.any_horizontal( - (x == val if not arg.dtype() <= NullType() else x.is_null()) - for val, arg in zip(values, _pdt_args[1:], strict=True) - ) + return pl.any_horizontal(x == val for val in values) @impl(ops.str_contains) def _str_contains(x, y): @@ -733,3 +728,7 @@ def _is_not_nan(x): @impl(ops.coalesce) def _coalesce(*x): return pl.coalesce(*x) + + @impl(ops.pow, Int(), Int()) + def _pow(x, y): + return x.cast(pl.Float64()) ** y diff --git a/src/pydiverse/transform/_internal/backend/sql.py b/src/pydiverse/transform/_internal/backend/sql.py index d3d56e6..3ad1bb0 100644 --- a/src/pydiverse/transform/_internal/backend/sql.py +++ b/src/pydiverse/transform/_internal/backend/sql.py @@ -736,14 +736,10 @@ def _floordiv(lhs, rhs): @impl(ops.pow) def _pow(lhs, rhs): - if isinstance(lhs.type, sqa.Float) or isinstance(rhs.type, sqa.Float): - type_ = sqa.Double() - elif isinstance(lhs.type, sqa.Numeric) or isinstance(rhs, sqa.Numeric): - type_ = sqa.Numeric() - else: - type_ = sqa.Double() - - return sqa.func.POW(lhs, rhs, type_=type_) + return_type = sqa.Double() + if isinstance(lhs.type, sqa.Numeric) and isinstance(rhs.type, sqa.Numeric): + return_type = sqa.Numeric() + return sqa.func.POW(lhs, rhs, type_=return_type) @impl(ops.bool_xor) def _xor(lhs, rhs): @@ -763,10 +759,7 @@ def _round(x, decimals=0): @impl(ops.is_in) def _is_in(x, *values): - res = x.in_(v for v in values if not isinstance(v.type, sqa.types.NullType)) - if any(isinstance(v.type, sqa.types.NullType) 
for v in values): - res = res | x.is_(sqa.null()) - return res + return x.in_(v for v in values) @impl(ops.is_null) def _is_null(x): diff --git a/src/pydiverse/transform/_internal/backend/table_impl.py b/src/pydiverse/transform/_internal/backend/table_impl.py index be4e944..d952f1b 100644 --- a/src/pydiverse/transform/_internal/backend/table_impl.py +++ b/src/pydiverse/transform/_internal/backend/table_impl.py @@ -1,5 +1,6 @@ from __future__ import annotations +import functools import uuid from collections.abc import Generator, Iterable, Sequence from typing import TYPE_CHECKING, Any @@ -227,3 +228,15 @@ def _gt(lhs, rhs): @impl(ops.greater_equal) def _ge(lhs, rhs): return lhs >= rhs + + @impl(ops.horizontal_all) + def _horizontal_all(*args): + return functools.reduce(_and, args) + + @impl(ops.horizontal_any) + def _horizontal_any(*args): + return functools.reduce(_or, args) + + @impl(ops.horizontal_sum) + def _horizontal_sum(*args): + return functools.reduce(_add, args) diff --git a/src/pydiverse/transform/_internal/ops/ops/aggregation.py b/src/pydiverse/transform/_internal/ops/ops/aggregation.py index 4688aee..191e8d9 100644 --- a/src/pydiverse/transform/_internal/ops/ops/aggregation.py +++ b/src/pydiverse/transform/_internal/ops/ops/aggregation.py @@ -34,21 +34,42 @@ def __init__( ) -min = Aggregation("min", *(Signature(dtype, return_type=dtype) for dtype in COMPARABLE)) +min = Aggregation( + "min", + *(Signature(dtype, return_type=dtype) for dtype in COMPARABLE), + doc="Computes the minimum value in each group.", +) -max = Aggregation("max", *(Signature(dtype, return_type=dtype) for dtype in COMPARABLE)) +max = Aggregation( + "max", + *(Signature(dtype, return_type=dtype) for dtype in COMPARABLE), + doc="Computes the maximum value in each group.", +) mean = Aggregation( "mean", *(Signature(dtype, return_type=dtype) for dtype in (Float(), Decimal())), Signature(Int(), return_type=Float()), + doc="Computes the average value in each group.", ) -sum = 
Aggregation("sum", *(Signature(dtype, return_type=dtype) for dtype in NUMERIC)) +sum = Aggregation( + "sum", + *(Signature(dtype, return_type=dtype) for dtype in NUMERIC), + doc="Computes the sum of values in each group.", +) -any = Aggregation("any", Signature(Bool(), return_type=Bool())) +any = Aggregation( + "any", + Signature(Bool(), return_type=Bool()), + doc="Indicates whether at least one value in a group is True.", +) -all = Aggregation("all", Signature(Bool(), return_type=Bool())) +all = Aggregation( + "all", + Signature(Bool(), return_type=Bool()), + doc="Indicates whether every non-null value in a group is True.", +) count = Aggregation( "count", diff --git a/src/pydiverse/transform/_internal/ops/ops/arithmetic.py b/src/pydiverse/transform/_internal/ops/ops/arithmetic.py index e044af8..1629f1e 100644 --- a/src/pydiverse/transform/_internal/ops/ops/arithmetic.py +++ b/src/pydiverse/transform/_internal/ops/ops/arithmetic.py @@ -18,6 +18,7 @@ *(Signature(dtype, dtype, return_type=dtype) for dtype in NUMERIC), Signature(String(), String(), return_type=String()), Signature(Duration(), Duration(), return_type=Duration()), + doc="Addition +", ) sub = Operator( @@ -27,10 +28,13 @@ Signature(Date(), Date(), return_type=Duration()), Signature(Datetime(), Date(), return_type=Duration()), Signature(Date(), Datetime(), return_type=Duration()), + doc="Subtraction -", ) mul = Operator( - "__mul__", *(Signature(dtype, dtype, return_type=dtype) for dtype in NUMERIC) + "__mul__", + *(Signature(dtype, dtype, return_type=dtype) for dtype in NUMERIC), + doc="Multiplication *", ) truediv = Operator( @@ -38,13 +42,14 @@ Signature(Int(), Int(), return_type=Float()), Signature(Float(), Float(), return_type=Float()), Signature(Decimal(), Decimal(), return_type=Decimal()), + doc="True division /", ) floordiv = Operator( "__floordiv__", Signature(Int(), Int(), return_type=Int()), doc=""" -Integer division. +Integer division // Warning ------- @@ -65,7 +70,7 @@ ... 
"b": [7, 7, -7, -7], ... } ... ) ->>> t >> mutate(r=t.a // t.b) >> export(Polars()) +>>> t >> mutate(r=t.a // t.b) >> show() shape: (4, 3) ┌─────┬─────┬─────┐ │ a ┆ b ┆ r │ @@ -84,7 +89,7 @@ "__mod__", Signature(Int(), Int(), return_type=Int()), doc=""" -Computes the remainder of integer division. +The remainder of integer division % Warning ------- @@ -106,7 +111,7 @@ ... "b": [7, 7, -7, -7], ... } ... ) ->>> t >> mutate(r=t.a % t.b) >> export(Polars()) +>>> t >> mutate(r=t.a % t.b) >> show() shape: (4, 3) ┌─────┬─────┬─────┐ │ a ┆ b ┆ r │ diff --git a/src/pydiverse/transform/_internal/ops/ops/comparison.py b/src/pydiverse/transform/_internal/ops/ops/comparison.py index d7220ae..5e7ef6c 100644 --- a/src/pydiverse/transform/_internal/ops/ops/comparison.py +++ b/src/pydiverse/transform/_internal/ops/ops/comparison.py @@ -4,35 +4,68 @@ from pydiverse.transform._internal.ops.signature import Signature from pydiverse.transform._internal.tree.types import COMPARABLE, Bool, D -equal = Operator("__eq__", Signature(D, D, return_type=Bool())) +equal = Operator( + "__eq__", Signature(D, D, return_type=Bool()), doc="Equality comparison ==" +) -not_equal = Operator("__ne__", Signature(D, D, return_type=Bool())) +not_equal = Operator( + "__ne__", Signature(D, D, return_type=Bool()), doc="Non-equality comparison !=" +) less_than = Operator( "__lt__", *(Signature(t, t, return_type=Bool()) for t in COMPARABLE), - doc=""" -`<` as you know it. 
-""", + doc="Less than comparison <", ) less_equal = Operator( - "__le__", *(Signature(t, t, return_type=Bool()) for t in COMPARABLE) + "__le__", + *(Signature(t, t, return_type=Bool()) for t in COMPARABLE), + doc="Less than or equal to comparison <=", ) greater_than = Operator( - "__gt__", *(Signature(t, t, return_type=Bool()) for t in COMPARABLE) + "__gt__", + *(Signature(t, t, return_type=Bool()) for t in COMPARABLE), + doc="Greater than comparison >", ) greater_equal = Operator( - "__ge__", *(Signature(t, t, return_type=Bool()) for t in COMPARABLE) + "__ge__", + *(Signature(t, t, return_type=Bool()) for t in COMPARABLE), + doc="Greater than or equal to comparison >=", ) -is_null = Operator("is_null", Signature(D, return_type=Bool())) +is_null = Operator( + "is_null", + Signature(D, return_type=Bool()), + doc="Indicates whether the value is null.", +) -is_not_null = Operator("is_not_null", Signature(D, return_type=Bool())) +is_not_null = Operator( + "is_not_null", + Signature(D, return_type=Bool()), + doc="Indicates whether the value is not null.", +) -fill_null = Operator("fill_null", Signature(D, D, return_type=D)) +fill_null = Operator( + "fill_null", + Signature(D, D, return_type=D), + doc="Replaces every null by the given value.", +) -is_in = Operator("is_in", Signature(D, D, ..., return_type=Bool())) +is_in = Operator( + "is_in", + Signature(D, D, ..., return_type=Bool()), + doc=""" +Whether the value equals one of the given. + +Note +---- +The expression ``t.c.is_in(a1, a2, ...)`` is equivalent to +``(t.c == a1) | (t.c == a2) | ...``, so passing null to ``is_in`` will result in +null. To compare for equality with null, use +:doc:`pydiverse.transform.ColExpr.is_null`. 
+""", +) diff --git a/src/pydiverse/transform/_internal/ops/ops/datetime.py b/src/pydiverse/transform/_internal/ops/ops/datetime.py index 194f3b0..60c6cf1 100644 --- a/src/pydiverse/transform/_internal/ops/ops/datetime.py +++ b/src/pydiverse/transform/_internal/ops/ops/datetime.py @@ -6,17 +6,21 @@ class DatetimeExtract(Operator): - def __init__(self, name: str, doc: str = ""): - super().__init__(name, Signature(Datetime(), return_type=Int()), doc=doc) + def __init__(self, name: str, doc: str | None = None): + super().__init__( + name, + Signature(Datetime(), return_type=Int()), + doc=doc if doc is not None else f"Extracts the {name[3:]} component.", + ) class DateExtract(Operator): - def __init__(self, name: str, doc: str = ""): + def __init__(self, name: str, doc: str | None = None): super().__init__( name, Signature(Date(), return_type=Int()), Signature(Datetime(), return_type=Int()), - doc=doc, + doc=doc if doc is not None else f"Extracts the {name[3:]} component.", ) @@ -36,9 +40,23 @@ def __init__(self, name: str, doc: str = ""): dt_microsecond = DatetimeExtract("dt.microsecond") -dt_day_of_week = DateExtract("dt.day_of_week") +dt_day_of_week = DateExtract( + "dt.day_of_week", + doc=""" +The number of the current weekday. + +This is one-based, so Monday is 1 and Sunday is 7. +""", +) + +dt_day_of_year = DateExtract( + "dt.day_of_year", + doc=""" +The number of days since the beginning of the year. -dt_day_of_year = DateExtract("dt.day_of_year") +This is one-based, so it returns 1 for the 1st of January. 
+""", +) class DurationToUnit(Operator): diff --git a/src/pydiverse/transform/_internal/ops/ops/horizontal.py b/src/pydiverse/transform/_internal/ops/ops/horizontal.py index 6fb2b45..6b62a32 100644 --- a/src/pydiverse/transform/_internal/ops/ops/horizontal.py +++ b/src/pydiverse/transform/_internal/ops/ops/horizontal.py @@ -2,7 +2,14 @@ from pydiverse.transform._internal.ops.op import Operator from pydiverse.transform._internal.ops.signature import Signature -from pydiverse.transform._internal.tree.types import COMPARABLE, D +from pydiverse.transform._internal.tree.types import ( + COMPARABLE, + NUMERIC, + Bool, + D, + Duration, + String, +) class Horizontal(Operator): @@ -17,11 +24,124 @@ def __init__(self, name: str, *signatures: Signature, doc: str = ""): horizontal_max = Horizontal( - "max", *(Signature(dtype, dtype, ..., return_type=dtype) for dtype in COMPARABLE) + "max", + *(Signature(dtype, dtype, ..., return_type=dtype) for dtype in COMPARABLE), + doc=""" +The maximum of the given columns. + +Examples +-------- +>>> t = pdt.Table( +... { +... "a": [5, None, 435, -1, 8, None], +... "b": [-45, None, 6, 23, -1, 0], +... "c": [10, None, 2, None, -53, 3], +... } +... ) +>>> t >> mutate(x=pdt.max(t.a, t.b, t.c)) >> show() +Table , backend: PolarsImpl +shape: (6, 4) +┌──────┬──────┬──────┬──────┐ +│ a ┆ b ┆ c ┆ x │ +│ --- ┆ --- ┆ --- ┆ --- │ +│ i64 ┆ i64 ┆ i64 ┆ i64 │ +╞══════╪══════╪══════╪══════╡ +│ 5 ┆ -45 ┆ 10 ┆ 10 │ +│ null ┆ null ┆ null ┆ null │ +│ 435 ┆ 6 ┆ 2 ┆ 435 │ +│ -1 ┆ 23 ┆ null ┆ 23 │ +│ 8 ┆ -1 ┆ -53 ┆ 8 │ +│ null ┆ 0 ┆ 3 ┆ 3 │ +└──────┴──────┴──────┴──────┘ +""", ) horizontal_min = Horizontal( - "min", *(Signature(dtype, dtype, ..., return_type=dtype) for dtype in COMPARABLE) + "min", + *(Signature(dtype, dtype, ..., return_type=dtype) for dtype in COMPARABLE), + doc=""" +The minimum of the given columns. + +Examples +-------- +>>> t = pdt.Table( +... { +... "a": [5, None, 435, -1, 8, None], +... "b": [-45, None, 6, 23, -1, 0], +... 
"c": [10, None, 2, None, -53, 3], +... } +... ) +>>> t >> mutate(x=pdt.min(t.a, t.b, t.c)) >> show() +Table , backend: PolarsImpl +shape: (6, 4) +┌──────┬──────┬──────┬──────┐ +│ a ┆ b ┆ c ┆ x │ +│ --- ┆ --- ┆ --- ┆ --- │ +│ i64 ┆ i64 ┆ i64 ┆ i64 │ +╞══════╪══════╪══════╪══════╡ +│ 5 ┆ -45 ┆ 10 ┆ -45 │ +│ null ┆ null ┆ null ┆ null │ +│ 435 ┆ 6 ┆ 2 ┆ 2 │ +│ -1 ┆ 23 ┆ null ┆ -1 │ +│ 8 ┆ -1 ┆ -53 ┆ -53 │ +│ null ┆ 0 ┆ 3 ┆ 0 │ +└──────┴──────┴──────┴──────┘ +""", ) -coalesce = Horizontal("coalesce", Signature(D, D, ..., return_type=D)) +coalesce = Horizontal( + "coalesce", + Signature(D, D, ..., return_type=D), + doc=""" +Returns the first non-null value among the given. + +:param arg: + The first value. + +:param args: + Further values. All must have the same type. + +Examples +-------- +>>> t = pdt.Table( +... { +... "a": [5, None, 435, -1, 8, None], +... "b": [-45, None, 6, 23, 1, 0], +... "c": [10, 2, None, None, None, None], +... } +... ) +>>> ( +... t +... >> mutate( +... x=pdt.coalesce(t.a, t.b, t.c), +... y=pdt.coalesce(t.c, t.b, t.a), +... ) +... >> show() +... 
) +Table , backend: PolarsImpl +shape: (6, 5) +┌──────┬──────┬──────┬─────┬─────┐ +│ a ┆ b ┆ c ┆ x ┆ y │ +│ --- ┆ --- ┆ --- ┆ --- ┆ --- │ +│ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ +╞══════╪══════╪══════╪═════╪═════╡ +│ 5 ┆ -45 ┆ 10 ┆ 5 ┆ 10 │ +│ null ┆ null ┆ 2 ┆ 2 ┆ 2 │ +│ 435 ┆ 6 ┆ null ┆ 435 ┆ 6 │ +│ -1 ┆ 23 ┆ null ┆ -1 ┆ 23 │ +│ 8 ┆ 1 ┆ null ┆ 8 ┆ 1 │ +│ null ┆ 0 ┆ null ┆ 0 ┆ 0 │ +└──────┴──────┴──────┴─────┴─────┘ +""", +) + +horizontal_any = Horizontal("any", Signature(Bool(), Bool(), ..., return_type=Bool())) + +horizontal_all = Horizontal("all", Signature(Bool(), Bool(), ..., return_type=Bool())) + +horizontal_sum = Horizontal( + "sum", + *(Signature(dtype, dtype, ..., return_type=dtype) for dtype in NUMERIC), + Signature(String(), String(), ..., return_type=String()), + Signature(Duration(), Duration(), ..., return_type=Duration()), +) diff --git a/src/pydiverse/transform/_internal/ops/ops/logical.py b/src/pydiverse/transform/_internal/ops/ops/logical.py index 16b1887..661bc64 100644 --- a/src/pydiverse/transform/_internal/ops/ops/logical.py +++ b/src/pydiverse/transform/_internal/ops/ops/logical.py @@ -4,10 +4,134 @@ from pydiverse.transform._internal.ops.signature import Signature from pydiverse.transform._internal.tree.types import Bool -bool_and = Operator("__and__", Signature(Bool(), Bool(), return_type=Bool())) +bool_and = Operator( + "__and__", + Signature(Bool(), Bool(), return_type=Bool()), + doc=""" +Boolean AND (__and__) -bool_or = Operator("__or__", Signature(Bool(), Bool(), return_type=Bool())) +Examples +-------- +>>> t = pdt.Table( +... { +... "a": [True, True, True, False, False, None], +... "b": [True, False, None, False, None, None], +... }, +... name="bool table", +... 
) +>>> t >> mutate(x=t.a & t.b) >> show() +Table bool table, backend: PolarsImpl +shape: (6, 3) +┌───────┬───────┬───────┐ +│ a ┆ b ┆ x │ +│ --- ┆ --- ┆ --- │ +│ bool ┆ bool ┆ bool │ +╞═══════╪═══════╪═══════╡ +│ true ┆ true ┆ true │ +│ true ┆ false ┆ false │ +│ true ┆ null ┆ null │ +│ false ┆ false ┆ false │ +│ false ┆ null ┆ false │ +│ null ┆ null ┆ null │ +└───────┴───────┴───────┘ +""", +) -bool_xor = Operator("__xor__", Signature(Bool(), Bool(), return_type=Bool())) +bool_or = Operator( + "__or__", + Signature(Bool(), Bool(), return_type=Bool()), + doc=""" +Boolean OR (__or__) -bool_invert = Operator("__invert__", Signature(Bool(), return_type=Bool())) +Examples +-------- +>>> t = pdt.Table( +... { +... "a": [True, True, True, False, False, None], +... "b": [True, False, None, False, None, None], +... }, +... name="bool table", +... ) +>>> t >> mutate(x=t.a | t.b) >> show() +Table bool table, backend: PolarsImpl +shape: (6, 3) +┌───────┬───────┬───────┐ +│ a ┆ b ┆ x │ +│ --- ┆ --- ┆ --- │ +│ bool ┆ bool ┆ bool │ +╞═══════╪═══════╪═══════╡ +│ true ┆ true ┆ true │ +│ true ┆ false ┆ true │ +│ true ┆ null ┆ true │ +│ false ┆ false ┆ false │ +│ false ┆ null ┆ null │ +│ null ┆ null ┆ null │ +└───────┴───────┴───────┘ +""", +) + +bool_xor = Operator( + "__xor__", + Signature(Bool(), Bool(), return_type=Bool()), + doc=""" +Boolean XOR (__xor__) + +Examples +-------- +>>> t = pdt.Table( +... { +... "a": [True, True, True, False, False, None], +... "b": [True, False, None, False, None, None], +... }, +... name="bool table", +... 
) +>>> t >> mutate(x=t.a ^ t.b) >> show() +Table bool table, backend: PolarsImpl +shape: (6, 3) +┌───────┬───────┬───────┐ +│ a ┆ b ┆ x │ +│ --- ┆ --- ┆ --- │ +│ bool ┆ bool ┆ bool │ +╞═══════╪═══════╪═══════╡ +│ true ┆ true ┆ false │ +│ true ┆ false ┆ true │ +│ true ┆ null ┆ null │ +│ false ┆ false ┆ false │ +│ false ┆ null ┆ null │ +│ null ┆ null ┆ null │ +└───────┴───────┴───────┘ +""", +) + +bool_invert = Operator( + "__invert__", + Signature(Bool(), return_type=Bool()), + doc=""" +Boolean inversion (__invert__) + +Examples +-------- +>>> t = pdt.Table( +... { +... "a": [True, True, True, False, False, None], +... "b": [True, False, None, False, None, None], +... }, +... name="bool table", +... ) +>>> t >> mutate(x=~t.a) >> show() +Table bool table, backend: PolarsImpl +shape: (6, 3) +┌───────┬───────┬───────┐ +│ a ┆ b ┆ x │ +│ --- ┆ --- ┆ --- │ +│ bool ┆ bool ┆ bool │ +╞═══════╪═══════╪═══════╡ +│ true ┆ true ┆ false │ +│ true ┆ false ┆ false │ +│ true ┆ null ┆ false │ +│ false ┆ false ┆ true │ +│ false ┆ null ┆ true │ +│ null ┆ null ┆ null │ +└───────┴───────┴───────┘ +""", +) diff --git a/src/pydiverse/transform/_internal/ops/ops/numeric.py b/src/pydiverse/transform/_internal/ops/ops/numeric.py index 6260323..82b6a3f 100644 --- a/src/pydiverse/transform/_internal/ops/ops/numeric.py +++ b/src/pydiverse/transform/_internal/ops/ops/numeric.py @@ -9,38 +9,85 @@ Signature(Int(), Int(), return_type=Float()), Signature(Float(), Float(), return_type=Float()), Signature(Decimal(), Decimal(), return_type=Decimal()), + doc=""" +Computes the power x ** y. + +Note +---- +Polars throws on negative exponents in the integer case. A polars error like +`failed to convert X to u32` may be due to negative inputs to this function. 
+""", ) -neg = Operator("__neg__", *(Signature(t, return_type=t) for t in NUMERIC)) +neg = Operator( + "__neg__", + *(Signature(t, return_type=t) for t in NUMERIC), + doc="The unary - (negation) operator (__neg__)", +) -pos = Operator("__pos__", *(Signature(t, return_type=t) for t in NUMERIC)) +pos = Operator( + "__pos__", + *(Signature(t, return_type=t) for t in NUMERIC), + doc="The unary + operator (__pos__)", +) -abs = Operator("abs", *(Signature(t, return_type=t) for t in NUMERIC)) +abs = Operator( + "abs", + *(Signature(t, return_type=t) for t in NUMERIC), + doc="Computes the absolute value.", +) round = Operator( "round", *(Signature(t, Int(const=True), return_type=t) for t in NUMERIC), param_names=["self", "decimals"], default_values=[..., 0], + doc=""" +Rounds to a given number of decimals. + +:param decimals: + The number of decimals to round by. +""", ) floor = Operator( "floor", Signature(Float(), return_type=Float()), Signature(Decimal(), return_type=Decimal()), + doc="Returns the largest integer less than or equal to the input.", ) ceil = Operator( "ceil", Signature(Float(), return_type=Float()), Signature(Decimal(), return_type=Decimal()), + doc="Returns the smallest integer greater than or equal to the input.", ) -log = Operator("log", Signature(Float(), return_type=Float())) +log = Operator( + "log", + Signature(Float(), return_type=Float()), + doc="Computes the natural logarithm.", +) -exp = Operator("exp", Signature(Float(), return_type=Float())) +exp = Operator( + "exp", + Signature(Float(), return_type=Float()), + doc="Computes the exponential function.", +) -is_inf = Operator("is_inf", Signature(Float(), return_type=Bool())) +is_inf = Operator( + "is_inf", + Signature(Float(), return_type=Bool()), + doc=""" +Whether the number is infinite. + +Note +---- +This is currently only useful for backends supporting IEEE 754-floats. On +other backends it always returns False. 
+""", +) is_not_inf = Operator("is_not_inf", Signature(Float(), return_type=Bool())) diff --git a/src/pydiverse/transform/_internal/ops/ops/string.py b/src/pydiverse/transform/_internal/ops/ops/string.py index abca639..1640f58 100644 --- a/src/pydiverse/transform/_internal/ops/ops/string.py +++ b/src/pydiverse/transform/_internal/ops/ops/string.py @@ -10,28 +10,260 @@ def __init__(self, name: str, doc: str = ""): super().__init__(name, Signature(String(), return_type=String()), doc=doc) -str_strip = StrUnary("str.strip") -str_upper = StrUnary("str.upper") -str_lower = StrUnary("str.lower") +str_strip = StrUnary( + "str.strip", + doc=""" +Removes leading and trailing whitespace. -str_len = Operator("str.len", Signature(String(), return_type=Int())) +Examples +-------- +>>> t = pdt.Table( +... { +... "a": [" BCD ", "-- 00", " A^^u", "-O2"], +... "b": ["12431", "transform", "12__*m", " "], +... }, +... name="string table", +... ) +>>> t >> mutate(j=t.a.str.strip(), k=t.b.str.strip()) >> show() +Table string table, backend: PolarsImpl +shape: (4, 4) +┌────────┬───────────┬───────┬───────────┐ +│ a ┆ b ┆ j ┆ k │ +│ --- ┆ --- ┆ --- ┆ --- │ +│ str ┆ str ┆ str ┆ str │ +╞════════╪═══════════╪═══════╪═══════════╡ +│ BCD ┆ 12431 ┆ BCD ┆ 12431 │ +│ -- 00 ┆ transform ┆ -- 00 ┆ transform │ +│ A^^u ┆ 12__*m ┆ A^^u ┆ 12__*m │ +│ -O2 ┆ ┆ -O2 ┆ │ +└────────┴───────────┴───────┴───────────┘ +""", +) +str_upper = StrUnary( + "str.upper", + doc=""" +Converts all alphabet letters to upper case. + +Examples +-------- +>>> t = pdt.Table( +... { +... "a": [" BCD ", "-- 00", " A^^u", "-O2"], +... "b": ["12431", "transform", "12__*m", " "], +... }, +... name="string table", +... 
) +>>> t >> mutate(j=t.a.str.upper(), k=t.b.str.upper()) >> show() +Table string table, backend: PolarsImpl +shape: (4, 4) +┌────────┬───────────┬────────┬───────────┐ +│ a ┆ b ┆ j ┆ k │ +│ --- ┆ --- ┆ --- ┆ --- │ +│ str ┆ str ┆ str ┆ str │ +╞════════╪═══════════╪════════╪═══════════╡ +│ BCD ┆ 12431 ┆ BCD ┆ 12431 │ +│ -- 00 ┆ transform ┆ -- 00 ┆ TRANSFORM │ +│ A^^u ┆ 12__*m ┆ A^^U ┆ 12__*M │ +│ -O2 ┆ ┆ -O2 ┆ │ +└────────┴───────────┴────────┴───────────┘ +""", +) +str_lower = StrUnary( + "str.lower", + doc=""" +Converts all alphabet letters to lower case. + +Examples +-------- +>>> t = pdt.Table( +... { +... "a": [" BCD ", "-- 00", " A^^u", "-O2"], +... "b": ["12431", "transform", "12__*m", " "], +... }, +... name="string table", +... ) +>>> t >> mutate(j=t.a.str.lower(), k=t.b.str.lower()) >> show() +Table string table, backend: PolarsImpl +shape: (4, 4) +┌────────┬───────────┬────────┬───────────┐ +│ a ┆ b ┆ j ┆ k │ +│ --- ┆ --- ┆ --- ┆ --- │ +│ str ┆ str ┆ str ┆ str │ +╞════════╪═══════════╪════════╪═══════════╡ +│ BCD ┆ 12431 ┆ bcd ┆ 12431 │ +│ -- 00 ┆ transform ┆ -- 00 ┆ transform │ +│ A^^u ┆ 12__*m ┆ a^^u ┆ 12__*m │ +│ -O2 ┆ ┆ -o2 ┆ │ +└────────┴───────────┴────────┴───────────┘ +""", +) + +# We should write something about number of chars vs number of bytes here. +str_len = Operator( + "str.len", + Signature(String(), return_type=Int()), + doc=""" +Computes the length of the string. + +Leading and trailing whitespace is included in the length. + +Examples +-------- +>>> t = pdt.Table( +... { +... "a": [" BCD ", "-- 00", " A^^u", "-O2"], +... "b": ["12431", "transform", "12__*m", " "], +... }, +... name="string table", +... 
) +>>> t >> mutate(j=t.a.str.len(), k=t.b.str.len()) >> show() +Table string table, backend: PolarsImpl +shape: (4, 4) +┌────────┬───────────┬─────┬─────┐ +│ a ┆ b ┆ j ┆ k │ +│ --- ┆ --- ┆ --- ┆ --- │ +│ str ┆ str ┆ i64 ┆ i64 │ +╞════════╪═══════════╪═════╪═════╡ +│ BCD ┆ 12431 ┆ 6 ┆ 5 │ +│ -- 00 ┆ transform ┆ 5 ┆ 9 │ +│ A^^u ┆ 12__*m ┆ 5 ┆ 6 │ +│ -O2 ┆ ┆ 3 ┆ 3 │ +└────────┴───────────┴─────┴─────┘ +""", +) str_replace_all = Operator( "str.replace_all", Signature(String(), String(const=True), String(const=True), return_type=String()), param_names=["self", "substr", "replacement"], + doc=""" +Replaces all occurrences of a given substring by a different string. + +:param substr: + The string to replace. + +:param replacement: + The replacement string. + +Examples +-------- +>>> t = pdt.Table( +... { +... "a": [" BCD ", "-- 00", " A^^u", "-O2", ""], +... "b": ["12431", "transform", "12__*m", " ", "abbabbabba"], +... }, +... name="string table", +... ) +>>> ( +... t +... >> mutate( +... r=t.a.str.replace_all("-", "?"), +... s=t.b.str.replace_all("ansf", "[---]"), +... u=t.b.str.replace_all("abba", "#"), +... ) +... >> show() +... ) +Table string table, backend: PolarsImpl +shape: (5, 5) +┌────────┬────────────┬────────┬────────────┬───────────┐ +│ a ┆ b ┆ r ┆ s ┆ u │ +│ --- ┆ --- ┆ --- ┆ --- ┆ --- │ +│ str ┆ str ┆ str ┆ str ┆ str │ +╞════════╪════════════╪════════╪════════════╪═══════════╡ +│ BCD ┆ 12431 ┆ BCD ┆ 12431 ┆ 12431 │ +│ -- 00 ┆ transform ┆ ?? 00 ┆ tr[---]orm ┆ transform │ +│ A^^u ┆ 12__*m ┆ A^^u ┆ 12__*m ┆ 12__*m │ +│ -O2 ┆ ┆ ?O2 ┆ ┆ │ +│ ┆ abbabbabba ┆ ┆ abbabbabba ┆ #bb# │ +└────────┴────────────┴────────┴────────────┴───────────┘ +""", ) str_starts_with = Operator( "str.starts_with", Signature(String(), String(const=True), return_type=Bool()), param_names=["self", "prefix"], + doc=""" +Whether the string starts with a given prefix. + +:param prefix: + The prefix to check. + +Examples +-------- +>>> t = pdt.Table( +... { +... 
"a": [" BCD ", "-- 00", " A^^u", "-O2", ""], +... "b": ["12431", "transform", "12__*m", " ", "abbabbabba"], +... }, +... name="string table", +... ) +>>> ( +... t +... >> mutate( +... j=t.a.str.starts_with("-"), +... k=t.b.str.starts_with("12"), +... ) +... >> show() +... ) +Table string table, backend: PolarsImpl +shape: (5, 4) +┌────────┬────────────┬───────┬───────┐ +│ a ┆ b ┆ j ┆ k │ +│ --- ┆ --- ┆ --- ┆ --- │ +│ str ┆ str ┆ bool ┆ bool │ +╞════════╪════════════╪═══════╪═══════╡ +│ BCD ┆ 12431 ┆ false ┆ true │ +│ -- 00 ┆ transform ┆ true ┆ false │ +│ A^^u ┆ 12__*m ┆ false ┆ true │ +│ -O2 ┆ ┆ true ┆ false │ +│ ┆ abbabbabba ┆ false ┆ false │ +└────────┴────────────┴───────┴───────┘ +""", ) str_ends_with = Operator( "str.ends_with", Signature(String(), String(const=True), return_type=Bool()), param_names=["self", "suffix"], + doc=""" +Whether the string ends with a given suffix. + +:param suffix: + The suffix to check. + +Examples +-------- +>>> t = pdt.Table( +... { +... "a": [" BCD ", "-- 00", " A^^u", "-O2", ""], +... "b": ["12431", "transform", "12__*m", " ", "abbabbabba"], +... }, +... name="string table", +... ) +>>> ( +... t +... >> mutate( +... j=t.a.str.ends_with(""), +... k=t.b.str.ends_with("m"), +... l=t.a.str.ends_with("^u"), +... ) +... >> show() +... 
) +Table string table, backend: PolarsImpl +shape: (5, 5) +┌────────┬────────────┬──────┬───────┬───────┐ +│ a ┆ b ┆ j ┆ k ┆ l │ +│ --- ┆ --- ┆ --- ┆ --- ┆ --- │ +│ str ┆ str ┆ bool ┆ bool ┆ bool │ +╞════════╪════════════╪══════╪═══════╪═══════╡ +│ BCD ┆ 12431 ┆ true ┆ false ┆ false │ +│ -- 00 ┆ transform ┆ true ┆ true ┆ false │ +│ A^^u ┆ 12__*m ┆ true ┆ true ┆ true │ +│ -O2 ┆ ┆ true ┆ false ┆ false │ +│ ┆ abbabbabba ┆ true ┆ false ┆ false │ +└────────┴────────────┴──────┴───────┴───────┘ +""", ) @@ -39,12 +271,91 @@ def __init__(self, name: str, doc: str = ""): "str.contains", Signature(String(), String(const=True), return_type=Bool()), param_names=["self", "substr"], + doc=""" +Whether the string contains a given substring. + +:param substr: + The substring to look for. + +Examples +-------- +>>> t = pdt.Table( +... { +... "a": [" BCD ", "-- 00", " A^^u", "-O2", ""], +... "b": ["12431", "transform", "12__*m", " ", "abbabbabba"], +... }, +... name="string table", +... ) +>>> ( +... t +... >> mutate( +... j=t.a.str.contains(" "), +... k=t.b.str.contains("a"), +... l=t.b.str.contains(""), +... ) +... >> show() +... ) +Table string table, backend: PolarsImpl +shape: (5, 5) +┌────────┬────────────┬───────┬───────┬──────┐ +│ a ┆ b ┆ j ┆ k ┆ l │ +│ --- ┆ --- ┆ --- ┆ --- ┆ --- │ +│ str ┆ str ┆ bool ┆ bool ┆ bool │ +╞════════╪════════════╪═══════╪═══════╪══════╡ +│ BCD ┆ 12431 ┆ true ┆ false ┆ true │ +│ -- 00 ┆ transform ┆ true ┆ true ┆ true │ +│ A^^u ┆ 12__*m ┆ true ┆ false ┆ true │ +│ -O2 ┆ ┆ false ┆ false ┆ true │ +│ ┆ abbabbabba ┆ false ┆ true ┆ true │ +└────────┴────────────┴───────┴───────┴──────┘ +""", ) str_slice = Operator( "str.slice", Signature(String(), Int(), Int(), return_type=String()), param_names=["self", "offset", "n"], + doc=""" +Returns a substring of the input string. + +:param offset: + The 0-based index of the first character included in the result. + +:param n: + The number of characters to include. 
If the string is shorter than *offset* + + *n*, the result only includes as many characters as there are. + +Examples +-------- +>>> t = pdt.Table( +... { +... "a": [" BCD ", "-- 00", " A^^u", "-O2", ""], +... "b": ["12431", "transform", "12__*m", " ", "abbabbabba"], +... }, +... name="string table", +... ) +>>> ( +... t +... >> mutate( +... j=t.a.str.slice(0, 2), +... k=t.b.str.slice(4, 10), +... ) +... >> show() +... ) +Table string table, backend: PolarsImpl +shape: (5, 4) +┌────────┬────────────┬─────┬────────┐ +│ a ┆ b ┆ j ┆ k │ +│ --- ┆ --- ┆ --- ┆ --- │ +│ str ┆ str ┆ str ┆ str │ +╞════════╪════════════╪═════╪════════╡ +│ BCD ┆ 12431 ┆ ┆ 1 │ +│ -- 00 ┆ transform ┆ -- ┆ sform │ +│ A^^u ┆ 12__*m ┆ A ┆ *m │ +│ -O2 ┆ ┆ -O ┆ │ +│ ┆ abbabbabba ┆ ┆ bbabba │ +└────────┴────────────┴─────┴────────┘ +""", ) str_to_datetime = Operator( diff --git a/src/pydiverse/transform/_internal/ops/ops/window.py b/src/pydiverse/transform/_internal/ops/ops/window.py index 1965b3f..d7fbad7 100644 --- a/src/pydiverse/transform/_internal/ops/ops/window.py +++ b/src/pydiverse/transform/_internal/ops/ops/window.py @@ -15,6 +15,7 @@ def __init__( param_names: list[str] | None = None, default_values: list[Any] | None = None, generate_expr_method=False, + arrange_required=True, doc: str = "", ): super().__init__( @@ -23,7 +24,7 @@ def __init__( ftype=Ftype.WINDOW, context_kwargs=[ ContextKwarg("partition_by", False), - ContextKwarg("arrange", True), + ContextKwarg("arrange", arrange_required), ], param_names=param_names, default_values=default_values, @@ -38,9 +39,90 @@ def __init__( param_names=["self", "n", "fill_value"], default_values=[..., ..., None], generate_expr_method=True, + arrange_required=False, + doc=""" +Shifts values in the column by an offset. + +:param n: + The number of places to shift by. May be negative. + +:param fill_value: + The value to write to the empty spaces created by the shift. Defaults to + null. + +Examples +-------- +>>> t = pdt.Table( +... { +... 
"a": [5, -1, 435, -34, 8, None, 0], +... "b": ["r", "True", "??", ". .", "-1/12", "abc", "#"], +... } +... ) +>>> ( +... t +... >> mutate( +... x=t.a.shift(2, -40), +... y=t.b.shift(1, arrange=t.a.nulls_last()), +... ) +... >> show() +... ) +Table , backend: PolarsImpl +shape: (7, 4) +┌──────┬───────┬─────┬───────┐ +│ a ┆ b ┆ x ┆ y │ +│ --- ┆ --- ┆ --- ┆ --- │ +│ i64 ┆ str ┆ i64 ┆ str │ +╞══════╪═══════╪═════╪═══════╡ +│ 5 ┆ r ┆ -40 ┆ # │ +│ -1 ┆ True ┆ -40 ┆ . . │ +│ 435 ┆ ?? ┆ 5 ┆ -1/12 │ +│ -34 ┆ . . ┆ -1 ┆ null │ +│ 8 ┆ -1/12 ┆ 435 ┆ r │ +│ null ┆ abc ┆ -34 ┆ ?? │ +│ 0 ┆ # ┆ 8 ┆ True │ +└──────┴───────┴─────┴───────┘ +""", ) -row_number = Window("row_number", Signature(return_type=Int())) +row_number = Window( + "row_number", + Signature(return_type=Int()), + arrange_required=False, + doc=""" +Computes the index of a row. + +Via the *arrange* argument, this can be done relative to a different order of +the rows. But note that the result may not be unique if the argument of +*arrange* contains duplicates. + +Examples +-------- +>>> t = pdt.Table({"a": [5, -1, 435, -34, 8, None, 0]}) +>>> ( +... t +... >> mutate( +... x=pdt.row_number(), +... y=pdt.row_number(arrange=t.a), +... ) +... >> show() +... ) +Table , backend: PolarsImpl +shape: (7, 3) +┌──────┬─────┬─────┐ +│ a ┆ x ┆ y │ +│ --- ┆ --- ┆ --- │ +│ i64 ┆ i64 ┆ i64 │ +╞══════╪═════╪═════╡ +│ 5 ┆ 1 ┆ 5 │ +│ -1 ┆ 2 ┆ 3 │ +│ 435 ┆ 3 ┆ 7 │ +│ -34 ┆ 4 ┆ 2 │ +│ 8 ┆ 5 ┆ 6 │ +│ null ┆ 6 ┆ 1 │ +│ 0 ┆ 7 ┆ 4 │ +└──────┴─────┴─────┘ +""", +) rank = Window( "rank", @@ -48,27 +130,75 @@ def __init__( doc=""" The number of strictly smaller elements in the column plus one. -This is the same as ``rank("min")`` in polars. +This is the same as ``rank("min")`` in polars. This function has two syntax +alternatives, as shown in the example below. The pdt. version is a bit more +flexible, because it allows sorting by multiple expressions. 
+ Examples -------- ->>> t = pdt.Table({"a": [3, 1, 4, 1, 5, 9, 4]}) ->>> t >> mutate(b=pdt.rank(arrange=t.a)) >> export(Polars(lazy=False)) -shape: (7, 2) -┌─────┬─────┐ -│ a ┆ b │ -│ --- ┆ --- │ -│ i64 ┆ i64 │ -╞═════╪═════╡ -│ 3 ┆ 3 │ -│ 1 ┆ 1 │ -│ 4 ┆ 4 │ -│ 1 ┆ 1 │ -│ 5 ┆ 6 │ -│ 9 ┆ 7 │ -│ 4 ┆ 4 │ -└─────┴─────┘ +>>> t = pdt.Table({"a": [5, -1, 435, -1, 8, None, 8]}) +>>> ( +... t +... >> mutate( +... x=t.a.nulls_first().rank(), +... y=pdt.rank(arrange=t.a.nulls_first()), +... ) +... >> show() +... ) +Table , backend: PolarsImpl +shape: (7, 3) +┌──────┬─────┬─────┐ +│ a ┆ x ┆ y │ +│ --- ┆ --- ┆ --- │ +│ i64 ┆ i64 ┆ i64 │ +╞══════╪═════╪═════╡ +│ 5 ┆ 4 ┆ 4 │ +│ -1 ┆ 2 ┆ 2 │ +│ 435 ┆ 7 ┆ 7 │ +│ -1 ┆ 2 ┆ 2 │ +│ 8 ┆ 5 ┆ 5 │ +│ null ┆ 1 ┆ 1 │ +│ 8 ┆ 5 ┆ 5 │ +└──────┴─────┴─────┘ """, ) -dense_rank = Window("dense_rank", Signature(return_type=Int())) +dense_rank = Window( + "dense_rank", + Signature(return_type=Int()), + doc=""" +The number of smaller or equal values in the column (not counting duplicates). + +This function has two syntax alternatives, as shown in the example below. The +pdt. version is a bit more flexible, because it allows sorting by multiple +expressions. + +Examples +-------- +>>> t = pdt.Table({"a": [5, -1, 435, -1, 8, None, 8]}) +>>> ( +... t +... >> mutate( +... x=t.a.nulls_first().dense_rank(), +... y=pdt.dense_rank(arrange=t.a.nulls_first()), +... ) +... >> show() +... 
) +Table , backend: PolarsImpl +shape: (7, 3) +┌──────┬─────┬─────┐ +│ a ┆ x ┆ y │ +│ --- ┆ --- ┆ --- │ +│ i64 ┆ i64 ┆ i64 │ +╞══════╪═════╪═════╡ +│ 5 ┆ 3 ┆ 3 │ +│ -1 ┆ 2 ┆ 2 │ +│ 435 ┆ 5 ┆ 5 │ +│ -1 ┆ 2 ┆ 2 │ +│ 8 ┆ 4 ┆ 4 │ +│ null ┆ 1 ┆ 1 │ +│ 8 ┆ 4 ┆ 4 │ +└──────┴─────┴─────┘ +""", +) diff --git a/src/pydiverse/transform/_internal/pipe/functions.py b/src/pydiverse/transform/_internal/pipe/functions.py index 77f294e..e5f2f7a 100644 --- a/src/pydiverse/transform/_internal/pipe/functions.py +++ b/src/pydiverse/transform/_internal/pipe/functions.py @@ -2,8 +2,6 @@ from __future__ import annotations -import functools -import operator from collections.abc import Iterable from typing import Any, overload @@ -24,6 +22,7 @@ Datetime, Decimal, Dtype, + Duration, Float, Int, String, @@ -42,44 +41,64 @@ def when(condition: ColExpr) -> WhenClause: def lit(val: Any, dtype: Dtype | None = None) -> LiteralCol: + """ + Creates a pydiverse.transform expression from a python builtin type. + + Usually, you can just use python builtins in expressions without wrapping them in + ``lit``. The pydiverse.transform data type of the value is then inferred. However, + ``lit`` allows to set the exact pydiverse.transform type, which may be useful + sometimes. + """ if dtype is not None and types.is_subtype(dtype): return LiteralCol(val, dtype).cast(dtype) return LiteralCol(val, dtype) -def all(arg: ColExpr[Bool], *args: ColExpr[Bool]) -> ColExpr[Bool]: - return functools.reduce(operator.and_, (arg, *args)) - - -def any(arg: ColExpr[Bool], *args: ColExpr[Bool]) -> ColExpr[Bool]: - return functools.reduce(operator.or_, (arg, *args)) - - -@overload -def sum(arg: ColExpr[Int], *args: ColExpr[Int]) -> ColExpr[Int]: ... - - -@overload -def sum(arg: ColExpr[Float], *args: ColExpr[Float]) -> ColExpr[Float]: ... - - -@overload -def sum(arg: ColExpr[Decimal], *args: ColExpr[Decimal]) -> ColExpr[Decimal]: ... - - -@overload -def sum(arg: ColExpr[String], *args: ColExpr[String]) -> ColExpr[String]: ... 
- +# --- from here the code is generated, do not delete this comment --- -def sum(arg: ColExpr, *args: ColExpr) -> ColExpr: - return functools.reduce(operator.add, (arg, *args)) +def coalesce(arg: ColExpr, *args: ColExpr) -> ColExpr: + """ + Returns the first non-null value among the given. -# --- from here the code is generated, do not delete this comment --- + :param arg: + The first value. + :param args: + Further values. All must have the same type. -def coalesce(arg: ColExpr, *args: ColExpr) -> ColExpr: - """""" + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [5, None, 435, -1, 8, None], + ... "b": [-45, None, 6, 23, 1, 0], + ... "c": [10, 2, None, None, None, None], + ... } + ... ) + >>> ( + ... t + ... >> mutate( + ... x=pdt.coalesce(t.a, t.b, t.c), + ... y=pdt.coalesce(t.c, t.b, t.a), + ... ) + ... >> show() + ... ) + Table , backend: PolarsImpl + shape: (6, 5) + ┌──────┬──────┬──────┬─────┬─────┐ + │ a ┆ b ┆ c ┆ x ┆ y │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╪═════╪═════╡ + │ 5 ┆ -45 ┆ 10 ┆ 5 ┆ 10 │ + │ null ┆ null ┆ 2 ┆ 2 ┆ 2 │ + │ 435 ┆ 6 ┆ null ┆ 435 ┆ 6 │ + │ -1 ┆ 23 ┆ null ┆ -1 ┆ 23 │ + │ 8 ┆ 1 ┆ null ┆ 8 ┆ 1 │ + │ null ┆ 0 ┆ null ┆ 0 ┆ 0 │ + └──────┴──────┴──────┴─────┴─────┘ + """ return ColFn(ops.coalesce, arg, *args) @@ -101,11 +120,56 @@ def dense_rank( partition_by: Col | ColName | Iterable[Col | ColName] | None = None, arrange: ColExpr | Iterable[ColExpr], ) -> ColExpr[Int]: - """""" + """ + The number of smaller or equal values in the column (not counting duplicates). + + This function has two syntax alternatives, as shown in the example below. The + pdt. version is a bit more flexible, because it allows sorting by multiple + expressions. + + Examples + -------- + >>> t = pdt.Table({"a": [5, -1, 435, -1, 8, None, 8]}) + >>> ( + ... t + ... >> mutate( + ... x=t.a.nulls_first().dense_rank(), + ... y=pdt.dense_rank(arrange=t.a.nulls_first()), + ... ) + ... >> show() + ... 
) + Table , backend: PolarsImpl + shape: (7, 3) + ┌──────┬─────┬─────┐ + │ a ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 5 ┆ 3 ┆ 3 │ + │ -1 ┆ 2 ┆ 2 │ + │ 435 ┆ 5 ┆ 5 │ + │ -1 ┆ 2 ┆ 2 │ + │ 8 ┆ 4 ┆ 4 │ + │ null ┆ 1 ┆ 1 │ + │ 8 ┆ 4 ┆ 4 │ + └──────┴─────┴─────┘ + """ return ColFn(ops.dense_rank, partition_by=partition_by, arrange=arrange) +def all(arg: ColExpr[Bool], *args: ColExpr[Bool]) -> ColExpr[Bool]: + """""" + + return ColFn(ops.horizontal_all, arg, *args) + + +def any(arg: ColExpr[Bool], *args: ColExpr[Bool]) -> ColExpr[Bool]: + """""" + + return ColFn(ops.horizontal_any, arg, *args) + + @overload def max(arg: ColExpr[Int], *args: ColExpr[Int]) -> ColExpr[Int]: ... @@ -131,7 +195,34 @@ def max(arg: ColExpr[Date], *args: ColExpr[Date]) -> ColExpr[Date]: ... def max(arg: ColExpr, *args: ColExpr) -> ColExpr: - """""" + """ + The maximum of the given columns. + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [5, None, 435, -1, 8, None], + ... "b": [-45, None, 6, 23, -1, 0], + ... "c": [10, None, 2, None, -53, 3], + ... } + ... ) + >>> t >> mutate(x=pdt.max(t.a, t.b, t.c)) >> show() + Table , backend: PolarsImpl + shape: (6, 4) + ┌──────┬──────┬──────┬──────┐ + │ a ┆ b ┆ c ┆ x │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╪══════╡ + │ 5 ┆ -45 ┆ 10 ┆ 10 │ + │ null ┆ null ┆ null ┆ null │ + │ 435 ┆ 6 ┆ 2 ┆ 435 │ + │ -1 ┆ 23 ┆ null ┆ 23 │ + │ 8 ┆ -1 ┆ -53 ┆ 8 │ + │ null ┆ 0 ┆ 3 ┆ 3 │ + └──────┴──────┴──────┴──────┘ + """ return ColFn(ops.horizontal_max, arg, *args) @@ -161,11 +252,64 @@ def min(arg: ColExpr[Date], *args: ColExpr[Date]) -> ColExpr[Date]: ... def min(arg: ColExpr, *args: ColExpr) -> ColExpr: - """""" + """ + The minimum of the given columns. + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [5, None, 435, -1, 8, None], + ... "b": [-45, None, 6, 23, -1, 0], + ... "c": [10, None, 2, None, -53, 3], + ... } + ... 
) + >>> t >> mutate(x=pdt.min(t.a, t.b, t.c)) >> show() + Table , backend: PolarsImpl + shape: (6, 4) + ┌──────┬──────┬──────┬──────┐ + │ a ┆ b ┆ c ┆ x │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 ┆ i64 │ + ╞══════╪══════╪══════╪══════╡ + │ 5 ┆ -45 ┆ 10 ┆ -45 │ + │ null ┆ null ┆ null ┆ null │ + │ 435 ┆ 6 ┆ 2 ┆ 2 │ + │ -1 ┆ 23 ┆ null ┆ -1 │ + │ 8 ┆ -1 ┆ -53 ┆ -53 │ + │ null ┆ 0 ┆ 3 ┆ 0 │ + └──────┴──────┴──────┴──────┘ + """ return ColFn(ops.horizontal_min, arg, *args) +@overload +def sum(arg: ColExpr[Int], *args: ColExpr[Int]) -> ColExpr[Int]: ... + + +@overload +def sum(arg: ColExpr[Float], *args: ColExpr[Float]) -> ColExpr[Float]: ... + + +@overload +def sum(arg: ColExpr[Decimal], *args: ColExpr[Decimal]) -> ColExpr[Decimal]: ... + + +@overload +def sum(arg: ColExpr[String], *args: ColExpr[String]) -> ColExpr[String]: ... + + +@overload +def sum(arg: ColExpr[Duration], *args: ColExpr[Duration]) -> ColExpr[Duration]: ... + + +def sum(arg: ColExpr, *args: ColExpr) -> ColExpr: + """""" + + return ColFn(ops.horizontal_sum, arg, *args) + + def rank( *, partition_by: Col | ColName | Iterable[Col | ColName] | None = None, @@ -174,26 +318,37 @@ def rank( """ The number of strictly smaller elements in the column plus one. - This is the same as ``rank("min")`` in polars. + This is the same as ``rank("min")`` in polars. This function has two syntax + alternatives, as shown in the example below. The pdt. version is a bit more + flexible, because it allows sorting by multiple expressions. + Examples -------- - >>> t = pdt.Table({"a": [3, 1, 4, 1, 5, 9, 4]}) - >>> t >> mutate(b=pdt.rank(arrange=t.a)) >> export(Polars(lazy=False)) - shape: (7, 2) - ┌─────┬─────┐ - │ a ┆ b │ - │ --- ┆ --- │ - │ i64 ┆ i64 │ - ╞═════╪═════╡ - │ 3 ┆ 3 │ - │ 1 ┆ 1 │ - │ 4 ┆ 4 │ - │ 1 ┆ 1 │ - │ 5 ┆ 6 │ - │ 9 ┆ 7 │ - │ 4 ┆ 4 │ - └─────┴─────┘ + >>> t = pdt.Table({"a": [5, -1, 435, -1, 8, None, 8]}) + >>> ( + ... t + ... >> mutate( + ... x=t.a.nulls_first().rank(), + ... 
y=pdt.rank(arrange=t.a.nulls_first()), + ... ) + ... >> show() + ... ) + Table , backend: PolarsImpl + shape: (7, 3) + ┌──────┬─────┬─────┐ + │ a ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 5 ┆ 4 ┆ 4 │ + │ -1 ┆ 2 ┆ 2 │ + │ 435 ┆ 7 ┆ 7 │ + │ -1 ┆ 2 ┆ 2 │ + │ 8 ┆ 5 ┆ 5 │ + │ null ┆ 1 ┆ 1 │ + │ 8 ┆ 5 ┆ 5 │ + └──────┴─────┴─────┘ """ return ColFn(ops.rank, partition_by=partition_by, arrange=arrange) @@ -202,8 +357,41 @@ def rank( def row_number( *, partition_by: Col | ColName | Iterable[Col | ColName] | None = None, - arrange: ColExpr | Iterable[ColExpr], + arrange: ColExpr | Iterable[ColExpr] | None = None, ) -> ColExpr[Int]: - """""" + """ + Computes the index of a row. + + Via the *arrange* argument, this can be done relative to a different order of + the rows. But note that the result may not be unique if the argument of + *arrange* contains duplicates. + + Examples + -------- + >>> t = pdt.Table({"a": [5, -1, 435, -34, 8, None, 0]}) + >>> ( + ... t + ... >> mutate( + ... x=pdt.row_number(), + ... y=pdt.row_number(arrange=t.a), + ... ) + ... >> show() + ... ) + Table , backend: PolarsImpl + shape: (7, 3) + ┌──────┬─────┬─────┐ + │ a ┆ x ┆ y │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ i64 │ + ╞══════╪═════╪═════╡ + │ 5 ┆ 1 ┆ 5 │ + │ -1 ┆ 2 ┆ 3 │ + │ 435 ┆ 3 ┆ 7 │ + │ -34 ┆ 4 ┆ 2 │ + │ 8 ┆ 5 ┆ 6 │ + │ null ┆ 6 ┆ 1 │ + │ 0 ┆ 7 ┆ 4 │ + └──────┴─────┴─────┘ + """ return ColFn(ops.row_number, partition_by=partition_by, arrange=arrange) diff --git a/src/pydiverse/transform/_internal/pipe/pipeable.py b/src/pydiverse/transform/_internal/pipe/pipeable.py index be6a843..873c4a8 100644 --- a/src/pydiverse/transform/_internal/pipe/pipeable.py +++ b/src/pydiverse/transform/_internal/pipe/pipeable.py @@ -14,9 +14,9 @@ def __init__(self, f=None, calls=None): def __rshift__(self, other) -> Pipeable: """ - Pipeable >> other - -> Lazy. Extend pipe. + The pipe operator for chaining verbs. 
""" + if isinstance(other, Pipeable): return Pipeable(calls=self.calls + other.calls) elif callable(other): diff --git a/src/pydiverse/transform/_internal/pipe/table.py b/src/pydiverse/transform/_internal/pipe/table.py index 0927a4d..071cfc5 100644 --- a/src/pydiverse/transform/_internal/pipe/table.py +++ b/src/pydiverse/transform/_internal/pipe/table.py @@ -31,11 +31,129 @@ class Table: which is a reference to the underlying abstract syntax tree. """ - # TODO: define exactly what can be given for the two and do type checks - # maybe call the second one execution_engine or similar? def __init__( self, resource: Any, backend: Target | None = None, *, name: str | None = None ): + """ + Creates a new table. + + :param resource: + The data source to construct the table from. This can be a polars or pandas + data frame, a python dictionary, a SQLAlchemy table or the name of a table + in a SQL database. + + :param backend: + The execution backend. This must be one of the pydiverse.transform backend + objects, see :doc:`targets`. It may carry additional information how to + interpret the *resource* argument, such as a SQLAlchemy engine. + + :param name: + The name of the table. It is not required to give the table a name, but may + make print output more readable. + + Examples + -------- + **Python dictionary**. + + >>> t = pdt.Table( + ... { + ... "a": [4, 3, -35, 24, 105], + ... "b": [4, 4, 0, -23, 42], + ... }, + ... name="T", + ... ) + >>> t >> show() + Table T, backend: PolarsImpl + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ i64 │ + ╞═════╪═════╡ + │ 4 ┆ 4 │ + │ 3 ┆ 4 │ + │ -35 ┆ 0 │ + │ 24 ┆ -23 │ + │ 105 ┆ 42 │ + └─────┴─────┘ + + **Polars data frame.** + + >>> df = pl.DataFrame( + ... { + ... "a": [4, 3, -35, 24, 105], + ... "b": ["a", "o", "---", "i23", " "], + ... }, + ... 
) + >>> t = pdt.Table(df, name="T") + >>> t >> show() + Table T, backend: PolarsImpl + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 4 ┆ a │ + │ 3 ┆ o │ + │ -35 ┆ --- │ + │ 24 ┆ i23 │ + │ 105 ┆ │ + └─────┴─────┘ + + **Pandas data frame.** Note that the data frame is converted to a polars data + frame and the backend is polars. + + >>> import pandas as pd + >>> df = pd.DataFrame( + ... { + ... "a": [4, 3, -35, 24, 105], + ... "b": ["a", "o", "---", "i23", " "], + ... }, + ... ) + >>> t = pdt.Table(df, name="T") + >>> t >> show() + Table T, backend: PolarsImpl + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 4 ┆ a │ + │ 3 ┆ o │ + │ -35 ┆ --- │ + │ 24 ┆ i23 │ + │ 105 ┆ │ + └─────┴─────┘ + + **SQL.** Assuming you have a SQLAlchemy engine ``engine``, which has a + connection to a database containing a table ``t1`` in a schema ``s1``, you can + create a pydiverse.transform Table from it as follows. + + >>> t = pdt.Table("t1", SqlAlchemy(engine, schema="s1")) + >>> t >> show() + Table t1, backend: PostgresImpl + shape: (5, 2) + ┌─────┬─────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ i64 ┆ str │ + ╞═════╪═════╡ + │ 4 ┆ a │ + │ 3 ┆ o │ + │ -35 ┆ --- │ + │ 24 ┆ i23 │ + │ 105 ┆ │ + └─────┴─────┘ + + Note that the name argument to the ``pdt.Table`` constructor was not specified, + so transform used the name of the SQL table. This example of course assumes that + a database connection is set up and the above table is already present in the + database. For more information on how to set up a connection, see + :doc:`/database_testing`.
+ """ + self._ast: AstNode = TableImpl.from_resource(resource, backend, name=name) self._cache = Cache( self._ast.cols, @@ -79,7 +197,11 @@ def __contains__(self, col: str | Col | ColName) -> bool: def __len__(self) -> int: return len(self._cache.select) - def __rshift__(self, rhs): + def __rshift__(self, rhs) -> Table: + """ + The pipe operator for chaining verbs. + """ + if isinstance(rhs, Pipeable): return rhs(self) if isinstance(rhs, Callable): diff --git a/src/pydiverse/transform/_internal/pipe/verbs.py b/src/pydiverse/transform/_internal/pipe/verbs.py index 55fc859..bdea51e 100644 --- a/src/pydiverse/transform/_internal/pipe/verbs.py +++ b/src/pydiverse/transform/_internal/pipe/verbs.py @@ -93,7 +93,7 @@ def alias(table: Table, new_name: str | None = None) -> Pipeable: >>> ( ... t ... >> join(s := t >> alias(), t.a == s.a, how="inner", suffix="_right") - ... >> export(Polars()) + ... >> show() ... ) shape: (6, 4) ┌─────┬────────┬─────────┬─────────┐ @@ -175,7 +175,7 @@ def collect(table: Table, target: Target | None = None) -> Pipeable: ... >> mutate(z=t.a + t.b.str.len()) ... >> collect() ... >> arrange(C.z, t.a) - ... >> export(Polars()) + ... >> show() ... 
) shape: (4, 3) ┌─────┬────────┬─────┐ @@ -256,7 +256,7 @@ def export( 3 1 10.8 transform 4 5 -81.2 ' ' 5 9 0.0 -22 - >>> t1 >> export(Polars()) + >>> t1 >> show() shape: (6, 3) ┌─────┬────────┬───────────┐ │ a ┆ b ┆ c │ @@ -332,7 +332,7 @@ def select(table: Table, *cols: Col | ColName) -> Pipeable: Examples -------- >>> t = pdt.Table({"a": [3, 2, 6, 4], "b": ["lll", "g", "u0", "__**_"]}) - >>> t >> select(t.a) >> export(Polars()) + >>> t >> select(t.a) >> show() shape: (4, 1) ┌─────┐ │ a │ @@ -372,7 +372,7 @@ def drop(table: Table, *cols: Col | ColName) -> Pipeable: Examples -------- >>> t = pdt.Table({"a": [3, 2, 6, 4], "b": ["lll", "g", "u0", "__**_"]}) - >>> t >> drop(t.a) >> export(Polars()) + >>> t >> drop(t.a) >> show() shape: (4, 1) ┌───────┐ │ b │ @@ -410,7 +410,7 @@ def rename(table: Table, name_map: dict[str, str]) -> Pipeable: Renaming one column: >>> t = pdt.Table({"a": [3, 2, 6, 4], "b": ["lll", "g", "u0", "__**_"]}) - >>> t >> rename({"a": "h"}) >> export(Polars()) + >>> t >> rename({"a": "h"}) >> show() shape: (4, 2) ┌─────┬───────┐ │ h ┆ b │ @@ -446,7 +446,7 @@ def rename(table: Table, name_map: dict[str, str]) -> Pipeable: column ``C.a``, however, refers to the column with name *a* in the *current* table. - >>> s >> mutate(u=t.a, v=C.a) >> export(Polars()) + >>> s >> mutate(u=t.a, v=C.a) >> show() shape: (4, 4) ┌─────┬───────┬─────┬───────┐ │ b ┆ a ┆ u ┆ v │ @@ -497,7 +497,7 @@ def mutate(table: Table, **kwargs: ColExpr) -> Pipeable: >>> t1 = pdt.Table( ... dict(a=[3, 1, 4, 1, 5, 9], b=[2.465, 0.22, -4.477, 10.8, -81.2, 0.0]) ... 
) - >>> t1 >> mutate(u=t1.a * t1.b) >> export(Polars()) + >>> t1 >> mutate(u=t1.a * t1.b) >> show() shape: (6, 3) ┌─────┬────────┬─────────┐ │ a ┆ b ┆ u │ @@ -549,7 +549,7 @@ def filter(table: Table, *predicates: ColExpr[Bool]) -> Pipeable: Examples -------- >>> t = pdt.Table({"a": [3, 2, 6, 4], "b": ["lll", "g", "u0", "__**_"]}) - >>> t >> filter(t.a <= 4, ~t.b.str.contains("_")) >> export(Polars()) + >>> t >> filter(t.a <= 4, ~t.b.str.contains("_")) >> show() shape: (2, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -613,7 +613,7 @@ def arrange(table: Table, *order_by: ColExpr) -> Pipeable: ... "p": [0.655, -4.33, None, 143.6, 0.0, 1.0, 4.5], ... } ... ) - >>> t >> arrange(t.r.nulls_first(), t.p) >> export(Polars()) + >>> t >> arrange(t.r.nulls_first(), t.p) >> show() shape: (7, 3) ┌──────┬─────┬───────┐ │ r ┆ s ┆ p │ @@ -628,7 +628,7 @@ def arrange(table: Table, *order_by: ColExpr) -> Pipeable: │ 6 ┆ s ┆ 0.0 │ │ 7 ┆ o ┆ -4.33 │ └──────┴─────┴───────┘ - >>> t >> arrange(t.p.nulls_last().descending(), t.s) >> export(Polars()) + >>> t >> arrange(t.p.nulls_last().descending(), t.s) >> show() shape: (7, 3) ┌──────┬─────┬───────┐ │ r ┆ s ┆ p │ @@ -732,7 +732,7 @@ def ungroup(table: Table) -> Pipeable: ... v=t.d.mean(filter=t.a >= 0), ... ) ... >> ungroup() - ... >> export(Polars()) + ... >> show() ... ) shape: (6, 6) ┌───────┬───────────┬───────┬─────┬────────┬─────┐ @@ -793,7 +793,7 @@ def summarize(table: Table, **kwargs: ColExpr) -> Pipeable: ... u=t.b.str.len().mean(), ... v=t.a.sum(filter=t.a >= 0), ... ) - ... >> export(Polars()) + ... >> show() ... ) shape: (3, 3) ┌───────┬──────────┬───────┐ @@ -874,7 +874,7 @@ def slice_head(table: Table, n: int, *, offset: int = 0) -> Pipeable: ... "b": ["l", "r", "srq", "---", " "], ... } ... 
) - >>> t >> slice_head(3, offset=1) >> export(Polars()) + >>> t >> slice_head(3, offset=1) >> show() shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -976,7 +976,7 @@ def join( -------- >>> t1 = pdt.Table({"a": [3, 1, 4, 1, 5, 9, 4]}, name="t1") >>> t2 = pdt.Table({"a": [4, 4, 1, 7], "b": ["f", "g", "h", "i"]}, name="t2") - >>> t1 >> join(t2, t1.a == t2.a, how="left") >> export(Polars()) + >>> t1 >> join(t2, t1.a == t2.a, how="left") >> show() shape: (9, 3) ┌─────┬──────┬──────┐ │ a ┆ a_t2 ┆ b_t2 │ diff --git a/src/pydiverse/transform/_internal/tree/col_expr.py b/src/pydiverse/transform/_internal/tree/col_expr.py index 42fecb7..5fa1c96 100644 --- a/src/pydiverse/transform/_internal/tree/col_expr.py +++ b/src/pydiverse/transform/_internal/tree/col_expr.py @@ -99,6 +99,48 @@ def ftype(self, *, agg_is_window: bool) -> Ftype: def map( self, mapping: dict[tuple | ColExpr, ColExpr], *, default: ColExpr | None = None ) -> CaseExpr: + """ + Replaces given values by other expressions. + + :param mapping: + A dictionary of expressions / tuples of expressions to expressions. The + input is compared to each key of the dictionary, and if it matches, the + corresponding value of the key is inserted. If the key is a tuple, the input + is compared against each element of the tuple and required to equal at least + one of them. + + :param default: + The value to insert if the input matches none of the keys of `mapping`. + + Note + ---- + If there are multiple columns in the key which have the same value at some row, + any of the corresponding values may be inserted (i.e. ensuring uniqueness of the + keys is your responsibility). + + Example + ------- + >>> t = pdt.Table( + ... { + ... "a": [4, 3, -35, 24, 105], + ... "b": [4, 4, 0, -23, 42], + ... } + ... 
) + >>> t >> mutate(c=t.a.is_in(t.b, 24)) >> show() + Table , backend: PolarsImpl + shape: (5, 3) + ┌─────┬─────┬───────┐ + │ a ┆ b ┆ c │ + │ --- ┆ --- ┆ --- │ + │ i64 ┆ i64 ┆ bool │ + ╞═════╪═════╪═══════╡ + │ 4 ┆ 4 ┆ true │ + │ 3 ┆ 4 ┆ false │ + │ -35 ┆ 0 ┆ false │ + │ 24 ┆ -23 ┆ true │ + │ 105 ┆ 42 ┆ false │ + └─────┴─────┴───────┘ + """ return CaseExpr( ( ( @@ -117,6 +159,80 @@ def map( ) def cast(self, target_type: Dtype) -> Cast: + """ + Cast to a different data type. + + :param target_type: + The type to cast to. + + The following casts are possible: + + .. list-table:: + :header-rows: 1 + + * - Input type + - Target type + - Note + * - Float + - Int8, Int16, Int32, Int64 + - Extracts the integer part (i.e. rounds towards 0). + * - String + - Int8, Int16, Int32, Int64 + - Parses the string as an integer. + * - String + - Float32, Float64 + - Parses the string as a floating point number. + * - Int + - String + - Writes the integer in base 10 as a string. + * - Float + - String + - Writes the floating point number in decimal notation in base 10. + * - Int + - Int8, Int16, Int32, Int64 + - Casts to an integer with a specified number of bits. Behavior is + backend-dependent. + * - Float + - Float32, Float64 + - Casts to a floating point number with a specified number of bits. + Behavior is backend-dependent. + * - Datetime + - Date + - Removes the time component of the Datetime. + * - Datetime + - String + - Writes the datetime in the format YYYY-MM-DD HH:MM:SS.SSSSSS. + Seconds are printed up to microsecond resolution. + * - Date + - String + - Writes the date in the format YYYY-MM-DD. + + + In addition to these casts, there are implicit conversions of integers to + floating point numbers and dates to datetimes. They happen automatically and + do not require an explicit cast. + + Note + ---- + In casts from strings, neither leading nor trailing whitespace is allowed. 
+ + Examples + -------- + >>> t = pdt.Table({"a": [3.5, 10.3, -434.4, -0.2]}, name="T") + >>> t >> mutate(b=t.a.cast(pdt.Int32())) >> show() + Table T, backend: PolarsImpl + shape: (4, 2) + ┌────────┬──────┐ + │ a ┆ b │ + │ --- ┆ --- │ + │ f64 ┆ i32 │ + ╞════════╪══════╡ + │ 3.5 ┆ 3 │ + │ 10.3 ┆ 10 │ + │ -434.4 ┆ -434 │ + │ -0.2 ┆ 0 │ + └────────┴──────┘ + """ return Cast(self, target_type) def iter_children(self) -> Iterable[ColExpr]: @@ -170,7 +286,7 @@ def abs(self: ColExpr[Float]) -> ColExpr[Float]: ... def abs(self: ColExpr[Decimal]) -> ColExpr[Decimal]: ... def abs(self: ColExpr) -> ColExpr: - """""" + """Computes the absolute value.""" return ColFn(ops.abs, self) @@ -192,7 +308,7 @@ def __add__( ) -> ColExpr[Duration]: ... def __add__(self: ColExpr, rhs: ColExpr) -> ColExpr: - """""" + """Addition +""" return ColFn(ops.add, self, rhs) @@ -214,7 +330,7 @@ def __radd__( ) -> ColExpr[Duration]: ... def __radd__(self: ColExpr, rhs: ColExpr) -> ColExpr: - """""" + """Addition +""" return ColFn(ops.add, rhs, self) @@ -224,7 +340,7 @@ def all( partition_by: Col | ColName | Iterable[Col | ColName] | None = None, filter: ColExpr[Bool] | Iterable[ColExpr[Bool]] | None = None, ) -> ColExpr[Bool]: - """""" + """Indicates whether every non-null value in a group is True.""" return ColFn(ops.all, self, partition_by=partition_by, filter=filter) @@ -234,7 +350,7 @@ def any( partition_by: Col | ColName | Iterable[Col | ColName] | None = None, filter: ColExpr[Bool] | Iterable[ColExpr[Bool]] | None = None, ) -> ColExpr[Bool]: - """""" + """Indicates whether at least one value in a group is True.""" return ColFn(ops.any, self, partition_by=partition_by, filter=filter) @@ -249,37 +365,226 @@ def ascending(self: ColExpr) -> ColExpr: return ColFn(ops.ascending, self) def __and__(self: ColExpr[Bool], rhs: ColExpr[Bool]) -> ColExpr[Bool]: - """""" + """ + Boolean AND (__and__) + + Examples + -------- + >>> t = pdt.Table( + ... { + ... 
"a": [True, True, True, False, False, None], + ... "b": [True, False, None, False, None, None], + ... }, + ... name="bool table", + ... ) + >>> t >> mutate(x=t.a & t.b) >> show() + Table bool table, backend: PolarsImpl + shape: (6, 3) + ┌───────┬───────┬───────┐ + │ a ┆ b ┆ x │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ true │ + │ true ┆ false ┆ false │ + │ true ┆ null ┆ null │ + │ false ┆ false ┆ false │ + │ false ┆ null ┆ false │ + │ null ┆ null ┆ null │ + └───────┴───────┴───────┘ + """ return ColFn(ops.bool_and, self, rhs) def __rand__(self: ColExpr[Bool], rhs: ColExpr[Bool]) -> ColExpr[Bool]: - """""" + """ + Boolean AND (__and__) + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [True, True, True, False, False, None], + ... "b": [True, False, None, False, None, None], + ... }, + ... name="bool table", + ... ) + >>> t >> mutate(x=t.a & t.b) >> show() + Table bool table, backend: PolarsImpl + shape: (6, 3) + ┌───────┬───────┬───────┐ + │ a ┆ b ┆ x │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ true │ + │ true ┆ false ┆ false │ + │ true ┆ null ┆ null │ + │ false ┆ false ┆ false │ + │ false ┆ null ┆ false │ + │ null ┆ null ┆ null │ + └───────┴───────┴───────┘ + """ return ColFn(ops.bool_and, rhs, self) def __invert__(self: ColExpr[Bool]) -> ColExpr[Bool]: - """""" + """ + Boolean inversion (__invert__) + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [True, True, True, False, False, None], + ... "b": [True, False, None, False, None, None], + ... }, + ... name="bool table", + ... 
) + >>> t >> mutate(x=~t.a) >> show() + Table bool table, backend: PolarsImpl + shape: (6, 3) + ┌───────┬───────┬───────┐ + │ a ┆ b ┆ x │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ true ┆ false ┆ false │ + │ true ┆ null ┆ false │ + │ false ┆ false ┆ true │ + │ false ┆ null ┆ true │ + │ null ┆ null ┆ null │ + └───────┴───────┴───────┘ + """ return ColFn(ops.bool_invert, self) def __or__(self: ColExpr[Bool], rhs: ColExpr[Bool]) -> ColExpr[Bool]: - """""" + """ + Boolean OR (__or__) + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [True, True, True, False, False, None], + ... "b": [True, False, None, False, None, None], + ... }, + ... name="bool table", + ... ) + >>> t >> mutate(x=t.a | t.b) >> show() + Table bool table, backend: PolarsImpl + shape: (6, 3) + ┌───────┬───────┬───────┐ + │ a ┆ b ┆ x │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ true ┆ null ┆ true │ + │ false ┆ false ┆ false │ + │ false ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └───────┴───────┴───────┘ + """ return ColFn(ops.bool_or, self, rhs) def __ror__(self: ColExpr[Bool], rhs: ColExpr[Bool]) -> ColExpr[Bool]: - """""" + """ + Boolean OR (__or__) + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [True, True, True, False, False, None], + ... "b": [True, False, None, False, None, None], + ... }, + ... name="bool table", + ... 
) + >>> t >> mutate(x=t.a | t.b) >> show() + Table bool table, backend: PolarsImpl + shape: (6, 3) + ┌───────┬───────┬───────┐ + │ a ┆ b ┆ x │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ true │ + │ true ┆ false ┆ true │ + │ true ┆ null ┆ true │ + │ false ┆ false ┆ false │ + │ false ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └───────┴───────┴───────┘ + """ return ColFn(ops.bool_or, rhs, self) def __xor__(self: ColExpr[Bool], rhs: ColExpr[Bool]) -> ColExpr[Bool]: - """""" + """ + Boolean XOR (__xor__) + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [True, True, True, False, False, None], + ... "b": [True, False, None, False, None, None], + ... }, + ... name="bool table", + ... ) + >>> t >> mutate(x=t.a ^ t.b) >> show() + Table bool table, backend: PolarsImpl + shape: (6, 3) + ┌───────┬───────┬───────┐ + │ a ┆ b ┆ x │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ true ┆ false ┆ true │ + │ true ┆ null ┆ null │ + │ false ┆ false ┆ false │ + │ false ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └───────┴───────┴───────┘ + """ return ColFn(ops.bool_xor, self, rhs) def __rxor__(self: ColExpr[Bool], rhs: ColExpr[Bool]) -> ColExpr[Bool]: - """""" + """ + Boolean XOR (__xor__) + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [True, True, True, False, False, None], + ... "b": [True, False, None, False, None, None], + ... }, + ... name="bool table", + ... 
) + >>> t >> mutate(x=t.a ^ t.b) >> show() + Table bool table, backend: PolarsImpl + shape: (6, 3) + ┌───────┬───────┬───────┐ + │ a ┆ b ┆ x │ + │ --- ┆ --- ┆ --- │ + │ bool ┆ bool ┆ bool │ + ╞═══════╪═══════╪═══════╡ + │ true ┆ true ┆ false │ + │ true ┆ false ┆ true │ + │ true ┆ null ┆ null │ + │ false ┆ false ┆ false │ + │ false ┆ null ┆ null │ + │ null ┆ null ┆ null │ + └───────┴───────┴───────┘ + """ return ColFn(ops.bool_xor, rhs, self) @@ -290,7 +595,7 @@ def ceil(self: ColExpr[Float]) -> ColExpr[Float]: ... def ceil(self: ColExpr[Decimal]) -> ColExpr[Decimal]: ... def ceil(self: ColExpr) -> ColExpr: - """""" + """Returns the smallest integer greater than or equal to the input.""" return ColFn(ops.ceil, self) @@ -317,17 +622,17 @@ def descending(self: ColExpr) -> ColExpr: return ColFn(ops.descending, self) def __eq__(self: ColExpr, rhs: ColExpr) -> ColExpr[Bool]: - """""" + """Equality comparison ==""" return ColFn(ops.equal, self, rhs) def exp(self: ColExpr[Float]) -> ColExpr[Float]: - """""" + """Computes the exponential function.""" return ColFn(ops.exp, self) def fill_null(self: ColExpr, rhs: ColExpr) -> ColExpr: - """""" + """Replaces every null by the given value.""" return ColFn(ops.fill_null, self, rhs) @@ -338,13 +643,13 @@ def floor(self: ColExpr[Float]) -> ColExpr[Float]: ... def floor(self: ColExpr[Decimal]) -> ColExpr[Decimal]: ... def floor(self: ColExpr) -> ColExpr: - """""" + """Returns the largest integer less than or equal to the input.""" return ColFn(ops.floor, self) def __floordiv__(self: ColExpr[Int], rhs: ColExpr[Int]) -> ColExpr[Int]: """ - Integer division. + Integer division // Warning ------- @@ -365,7 +670,7 @@ def __floordiv__(self: ColExpr[Int], rhs: ColExpr[Int]) -> ColExpr[Int]: ... "b": [7, 7, -7, -7], ... } ... 
) - >>> t >> mutate(r=t.a // t.b) >> export(Polars()) + >>> t >> mutate(r=t.a // t.b) >> show() shape: (4, 3) ┌─────┬─────┬─────┐ │ a ┆ b ┆ r │ @@ -383,7 +688,7 @@ def __floordiv__(self: ColExpr[Int], rhs: ColExpr[Int]) -> ColExpr[Int]: def __rfloordiv__(self: ColExpr[Int], rhs: ColExpr[Int]) -> ColExpr[Int]: """ - Integer division. + Integer division // Warning ------- @@ -404,7 +709,7 @@ def __rfloordiv__(self: ColExpr[Int], rhs: ColExpr[Int]) -> ColExpr[Int]: ... "b": [7, 7, -7, -7], ... } ... ) - >>> t >> mutate(r=t.a // t.b) >> export(Polars()) + >>> t >> mutate(r=t.a // t.b) >> show() shape: (4, 3) ┌─────┬─────┬─────┐ │ a ┆ b ┆ r │ @@ -439,7 +744,7 @@ def __ge__(self: ColExpr[Datetime], rhs: ColExpr[Datetime]) -> ColExpr[Bool]: .. def __ge__(self: ColExpr[Date], rhs: ColExpr[Date]) -> ColExpr[Bool]: ... def __ge__(self: ColExpr, rhs: ColExpr) -> ColExpr: - """""" + """Greater than or equal to comparison >=""" return ColFn(ops.greater_equal, self, rhs) @@ -462,17 +767,33 @@ def __gt__(self: ColExpr[Datetime], rhs: ColExpr[Datetime]) -> ColExpr[Bool]: .. def __gt__(self: ColExpr[Date], rhs: ColExpr[Date]) -> ColExpr[Bool]: ... def __gt__(self: ColExpr, rhs: ColExpr) -> ColExpr: - """""" + """Greater than comparison >""" return ColFn(ops.greater_than, self, rhs) def is_in(self: ColExpr, *rhs: ColExpr) -> ColExpr[Bool]: - """""" + """ + Whether the value equals one of the given values. + + Note + ---- + The expression ``t.c.is_in(a1, a2, ...)`` is equivalent to + ``(t.c == a1) | (t.c == a2) | ...``, so passing null to ``is_in`` will result in + null. To compare for equality with null, use + :doc:`pydiverse.transform.ColExpr.is_null`. + """ return ColFn(ops.is_in, self, *rhs) def is_inf(self: ColExpr[Float]) -> ColExpr[Bool]: - """""" + """ + Whether the number is infinite. + + Note + ---- + This is currently only useful for backends supporting IEEE 754-floats. On + other backends it always returns False. 
+ """ return ColFn(ops.is_inf, self) @@ -492,12 +813,12 @@ def is_not_nan(self: ColExpr[Float]) -> ColExpr[Bool]: return ColFn(ops.is_not_nan, self) def is_not_null(self: ColExpr) -> ColExpr[Bool]: - """""" + """Indicates whether the value is not null.""" return ColFn(ops.is_not_null, self) def is_null(self: ColExpr) -> ColExpr[Bool]: - """""" + """Indicates whether the value is null.""" return ColFn(ops.is_null, self) @@ -520,7 +841,7 @@ def __le__(self: ColExpr[Datetime], rhs: ColExpr[Datetime]) -> ColExpr[Bool]: .. def __le__(self: ColExpr[Date], rhs: ColExpr[Date]) -> ColExpr[Bool]: ... def __le__(self: ColExpr, rhs: ColExpr) -> ColExpr: - """""" + """Less than or equal to comparison <=""" return ColFn(ops.less_equal, self, rhs) @@ -543,14 +864,12 @@ def __lt__(self: ColExpr[Datetime], rhs: ColExpr[Datetime]) -> ColExpr[Bool]: .. def __lt__(self: ColExpr[Date], rhs: ColExpr[Date]) -> ColExpr[Bool]: ... def __lt__(self: ColExpr, rhs: ColExpr) -> ColExpr: - """ - `<` as you know it. - """ + """Less than comparison <""" return ColFn(ops.less_than, self, rhs) def log(self: ColExpr[Float]) -> ColExpr[Float]: - """""" + """Computes the natural logarithm.""" return ColFn(ops.log, self) @@ -608,7 +927,7 @@ def max( partition_by: Col | ColName | Iterable[Col | ColName] | None = None, filter: ColExpr[Bool] | Iterable[ColExpr[Bool]] | None = None, ) -> ColExpr: - """""" + """Computes the maximum value in each group.""" return ColFn(ops.max, self, partition_by=partition_by, filter=filter) @@ -642,7 +961,7 @@ def mean( partition_by: Col | ColName | Iterable[Col | ColName] | None = None, filter: ColExpr[Bool] | Iterable[ColExpr[Bool]] | None = None, ) -> ColExpr: - """""" + """Computes the average value in each group.""" return ColFn(ops.mean, self, partition_by=partition_by, filter=filter) @@ -700,13 +1019,13 @@ def min( partition_by: Col | ColName | Iterable[Col | ColName] | None = None, filter: ColExpr[Bool] | Iterable[ColExpr[Bool]] | None = None, ) -> ColExpr: - """""" 
+ """Computes the minimum value in each group.""" return ColFn(ops.min, self, partition_by=partition_by, filter=filter) def __mod__(self: ColExpr[Int], rhs: ColExpr[Int]) -> ColExpr[Int]: """ - Computes the remainder of integer division. + The remainder of integer division % Warning ------- @@ -728,7 +1047,7 @@ def __mod__(self: ColExpr[Int], rhs: ColExpr[Int]) -> ColExpr[Int]: ... "b": [7, 7, -7, -7], ... } ... ) - >>> t >> mutate(r=t.a % t.b) >> export(Polars()) + >>> t >> mutate(r=t.a % t.b) >> show() shape: (4, 3) ┌─────┬─────┬─────┐ │ a ┆ b ┆ r │ @@ -746,7 +1065,7 @@ def __mod__(self: ColExpr[Int], rhs: ColExpr[Int]) -> ColExpr[Int]: def __rmod__(self: ColExpr[Int], rhs: ColExpr[Int]) -> ColExpr[Int]: """ - Computes the remainder of integer division. + The remainder of integer division % Warning ------- @@ -768,7 +1087,7 @@ def __rmod__(self: ColExpr[Int], rhs: ColExpr[Int]) -> ColExpr[Int]: ... "b": [7, 7, -7, -7], ... } ... ) - >>> t >> mutate(r=t.a % t.b) >> export(Polars()) + >>> t >> mutate(r=t.a % t.b) >> show() shape: (4, 3) ┌─────┬─────┬─────┐ │ a ┆ b ┆ r │ @@ -794,7 +1113,7 @@ def __mul__(self: ColExpr[Float], rhs: ColExpr[Float]) -> ColExpr[Float]: ... def __mul__(self: ColExpr[Decimal], rhs: ColExpr[Decimal]) -> ColExpr[Decimal]: ... def __mul__(self: ColExpr, rhs: ColExpr) -> ColExpr: - """""" + """Multiplication *""" return ColFn(ops.mul, self, rhs) @@ -808,7 +1127,7 @@ def __rmul__(self: ColExpr[Float], rhs: ColExpr[Float]) -> ColExpr[Float]: ... def __rmul__(self: ColExpr[Decimal], rhs: ColExpr[Decimal]) -> ColExpr[Decimal]: ... def __rmul__(self: ColExpr, rhs: ColExpr) -> ColExpr: - """""" + """Multiplication *""" return ColFn(ops.mul, rhs, self) @@ -822,12 +1141,12 @@ def __neg__(self: ColExpr[Float]) -> ColExpr[Float]: ... def __neg__(self: ColExpr[Decimal]) -> ColExpr[Decimal]: ... 
def __neg__(self: ColExpr) -> ColExpr: - """""" + """The unary - (negation) operator (__neg__)""" return ColFn(ops.neg, self) def __ne__(self: ColExpr, rhs: ColExpr) -> ColExpr[Bool]: - """""" + """Non-equality comparison !=""" return ColFn(ops.not_equal, self, rhs) @@ -875,7 +1194,7 @@ def __pos__(self: ColExpr[Float]) -> ColExpr[Float]: ... def __pos__(self: ColExpr[Decimal]) -> ColExpr[Decimal]: ... def __pos__(self: ColExpr) -> ColExpr: - """""" + """The unary + operator (__pos__)""" return ColFn(ops.pos, self) @@ -889,7 +1208,14 @@ def __pow__(self: ColExpr[Float], rhs: ColExpr[Float]) -> ColExpr[Float]: ... def __pow__(self: ColExpr[Decimal], rhs: ColExpr[Decimal]) -> ColExpr[Decimal]: ... def __pow__(self: ColExpr, rhs: ColExpr) -> ColExpr: - """""" + """ + Computes the power x ** y. + + Note + ---- + Polars throws on negative exponents in the integer case. A polars error like + `failed to convert X to u32` may be due to negative inputs to this function. + """ return ColFn(ops.pow, self, rhs) @@ -903,7 +1229,14 @@ def __rpow__(self: ColExpr[Float], rhs: ColExpr[Float]) -> ColExpr[Float]: ... def __rpow__(self: ColExpr[Decimal], rhs: ColExpr[Decimal]) -> ColExpr[Decimal]: ... def __rpow__(self: ColExpr, rhs: ColExpr) -> ColExpr: - """""" + """ + Computes the power x ** y. + + Note + ---- + Polars throws on negative exponents in the integer case. A polars error like + `failed to convert X to u32` may be due to negative inputs to this function. + """ return ColFn(ops.pow, rhs, self) @@ -917,7 +1250,12 @@ def round(self: ColExpr[Float], decimals: int = 0) -> ColExpr[Float]: ... def round(self: ColExpr[Decimal], decimals: int = 0) -> ColExpr[Decimal]: ... def round(self: ColExpr, decimals: int = 0) -> ColExpr: - """""" + """ + Rounds to a given number of decimals. + + :param decimals: + The number of decimals to round by. 
+ """ return ColFn(ops.round, self, decimals) @@ -927,9 +1265,50 @@ def shift( fill_value: ColExpr = None, *, partition_by: Col | ColName | Iterable[Col | ColName] | None = None, - arrange: ColExpr | Iterable[ColExpr], + arrange: ColExpr | Iterable[ColExpr] | None = None, ) -> ColExpr: - """""" + """ + Shifts values in the column by an offset. + + :param n: + The number of places to shift by. May be negative. + + :param fill_value: + The value to write to the empty spaces created by the shift. Defaults to + null. + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [5, -1, 435, -34, 8, None, 0], + ... "b": ["r", "True", "??", ". .", "-1/12", "abc", "#"], + ... } + ... ) + >>> ( + ... t + ... >> mutate( + ... x=t.a.shift(2, -40), + ... y=t.b.shift(1, arrange=t.a.nulls_last()), + ... ) + ... >> show() + ... ) + Table , backend: PolarsImpl + shape: (7, 4) + ┌──────┬───────┬─────┬───────┐ + │ a ┆ b ┆ x ┆ y │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ i64 ┆ str ┆ i64 ┆ str │ + ╞══════╪═══════╪═════╪═══════╡ + │ 5 ┆ r ┆ -40 ┆ # │ + │ -1 ┆ True ┆ -40 ┆ . . │ + │ 435 ┆ ?? ┆ 5 ┆ -1/12 │ + │ -34 ┆ . . ┆ -1 ┆ null │ + │ 8 ┆ -1/12 ┆ 435 ┆ r │ + │ null ┆ abc ┆ -34 ┆ ?? │ + │ 0 ┆ # ┆ 8 ┆ True │ + └──────┴───────┴─────┴───────┘ + """ return ColFn( ops.shift, self, n, fill_value, partition_by=partition_by, arrange=arrange @@ -959,7 +1338,7 @@ def __sub__(self: ColExpr[Datetime], rhs: ColExpr[Date]) -> ColExpr[Duration]: . def __sub__(self: ColExpr[Date], rhs: ColExpr[Datetime]) -> ColExpr[Duration]: ... def __sub__(self: ColExpr, rhs: ColExpr) -> ColExpr: - """""" + """Subtraction -""" return ColFn(ops.sub, self, rhs) @@ -987,7 +1366,7 @@ def __rsub__(self: ColExpr[Datetime], rhs: ColExpr[Date]) -> ColExpr[Duration]: def __rsub__(self: ColExpr[Date], rhs: ColExpr[Datetime]) -> ColExpr[Duration]: ... 
def __rsub__(self: ColExpr, rhs: ColExpr) -> ColExpr: - """""" + """Subtraction -""" return ColFn(ops.sub, rhs, self) @@ -1021,7 +1400,7 @@ def sum( partition_by: Col | ColName | Iterable[Col | ColName] | None = None, filter: ColExpr[Bool] | Iterable[ColExpr[Bool]] | None = None, ) -> ColExpr: - """""" + """Computes the sum of values in each group.""" return ColFn(ops.sum, self, partition_by=partition_by, filter=filter) @@ -1037,7 +1416,7 @@ def __truediv__( ) -> ColExpr[Decimal]: ... def __truediv__(self: ColExpr, rhs: ColExpr) -> ColExpr: - """""" + """True division /""" return ColFn(ops.truediv, self, rhs) @@ -1053,7 +1432,7 @@ def __rtruediv__( ) -> ColExpr[Decimal]: ... def __rtruediv__(self: ColExpr, rhs: ColExpr) -> ColExpr: - """""" + """True division /""" return ColFn(ops.truediv, rhs, self) @@ -1071,46 +1450,313 @@ class FnNamespace: @dataclasses.dataclass(slots=True) class StrNamespace(FnNamespace): def contains(self: ColExpr[String], substr: str) -> ColExpr[Bool]: - """""" + """ + Whether the string contains a given substring. + + :param substr: + The substring to look for. + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [" BCD ", "-- 00", " A^^u", "-O2", ""], + ... "b": ["12431", "transform", "12__*m", " ", "abbabbabba"], + ... }, + ... name="string table", + ... ) + >>> ( + ... t + ... >> mutate( + ... j=t.a.str.contains(" "), + ... k=t.b.str.contains("a"), + ... l=t.b.str.contains(""), + ... ) + ... >> show() + ... 
) + Table string table, backend: PolarsImpl + shape: (5, 5) + ┌────────┬────────────┬───────┬───────┬──────┐ + │ a ┆ b ┆ j ┆ k ┆ l │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ bool ┆ bool ┆ bool │ + ╞════════╪════════════╪═══════╪═══════╪══════╡ + │ BCD ┆ 12431 ┆ true ┆ false ┆ true │ + │ -- 00 ┆ transform ┆ true ┆ true ┆ true │ + │ A^^u ┆ 12__*m ┆ true ┆ false ┆ true │ + │ -O2 ┆ ┆ false ┆ false ┆ true │ + │ ┆ abbabbabba ┆ false ┆ true ┆ true │ + └────────┴────────────┴───────┴───────┴──────┘ + """ return ColFn(ops.str_contains, self.arg, substr) def ends_with(self: ColExpr[String], suffix: str) -> ColExpr[Bool]: - """""" + """ + Whether the string ends with a given suffix. + + :param suffix: + The suffix to check. + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [" BCD ", "-- 00", " A^^u", "-O2", ""], + ... "b": ["12431", "transform", "12__*m", " ", "abbabbabba"], + ... }, + ... name="string table", + ... ) + >>> ( + ... t + ... >> mutate( + ... j=t.a.str.ends_with(""), + ... k=t.b.str.ends_with("m"), + ... l=t.a.str.ends_with("^u"), + ... ) + ... >> show() + ... ) + Table string table, backend: PolarsImpl + shape: (5, 5) + ┌────────┬────────────┬──────┬───────┬───────┐ + │ a ┆ b ┆ j ┆ k ┆ l │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ bool ┆ bool ┆ bool │ + ╞════════╪════════════╪══════╪═══════╪═══════╡ + │ BCD ┆ 12431 ┆ true ┆ false ┆ false │ + │ -- 00 ┆ transform ┆ true ┆ true ┆ false │ + │ A^^u ┆ 12__*m ┆ true ┆ true ┆ true │ + │ -O2 ┆ ┆ true ┆ false ┆ false │ + │ ┆ abbabbabba ┆ true ┆ false ┆ false │ + └────────┴────────────┴──────┴───────┴───────┘ + """ return ColFn(ops.str_ends_with, self.arg, suffix) def len(self: ColExpr[String]) -> ColExpr[Int]: - """""" + """ + Computes the length of the string. + + Leading and trailing whitespace is included in the length. + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [" BCD ", "-- 00", " A^^u", "-O2"], + ... "b": ["12431", "transform", "12__*m", " "], + ... }, + ... 
name="string table", + ... ) + >>> t >> mutate(j=t.a.str.len(), k=t.b.str.len()) >> show() + Table string table, backend: PolarsImpl + shape: (4, 4) + ┌────────┬───────────┬─────┬─────┐ + │ a ┆ b ┆ j ┆ k │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ i64 ┆ i64 │ + ╞════════╪═══════════╪═════╪═════╡ + │ BCD ┆ 12431 ┆ 6 ┆ 5 │ + │ -- 00 ┆ transform ┆ 5 ┆ 9 │ + │ A^^u ┆ 12__*m ┆ 5 ┆ 6 │ + │ -O2 ┆ ┆ 3 ┆ 3 │ + └────────┴───────────┴─────┴─────┘ + """ return ColFn(ops.str_len, self.arg) def lower(self: ColExpr[String]) -> ColExpr[String]: - """""" + """ + Converts all alphabet letters to lower case. + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [" BCD ", "-- 00", " A^^u", "-O2"], + ... "b": ["12431", "transform", "12__*m", " "], + ... }, + ... name="string table", + ... ) + >>> t >> mutate(j=t.a.str.lower(), k=t.b.str.lower()) >> show() + Table string table, backend: PolarsImpl + shape: (4, 4) + ┌────────┬───────────┬────────┬───────────┐ + │ a ┆ b ┆ j ┆ k │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞════════╪═══════════╪════════╪═══════════╡ + │ BCD ┆ 12431 ┆ bcd ┆ 12431 │ + │ -- 00 ┆ transform ┆ -- 00 ┆ transform │ + │ A^^u ┆ 12__*m ┆ a^^u ┆ 12__*m │ + │ -O2 ┆ ┆ -o2 ┆ │ + └────────┴───────────┴────────┴───────────┘ + """ return ColFn(ops.str_lower, self.arg) def replace_all( self: ColExpr[String], substr: str, replacement: str ) -> ColExpr[String]: - """""" + """ + Replaces all occurrences of a given substring by a different string. + + :param substr: + The string to replace. + + :param replacement: + The replacement string. + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [" BCD ", "-- 00", " A^^u", "-O2", ""], + ... "b": ["12431", "transform", "12__*m", " ", "abbabbabba"], + ... }, + ... name="string table", + ... ) + >>> ( + ... t + ... >> mutate( + ... r=t.a.str.replace_all("-", "?"), + ... s=t.b.str.replace_all("ansf", "[---]"), + ... u=t.b.str.replace_all("abba", "#"), + ... ) + ... >> show() + ... 
) + Table string table, backend: PolarsImpl + shape: (5, 5) + ┌────────┬────────────┬────────┬────────────┬───────────┐ + │ a ┆ b ┆ r ┆ s ┆ u │ + │ --- ┆ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str ┆ str │ + ╞════════╪════════════╪════════╪════════════╪═══════════╡ + │ BCD ┆ 12431 ┆ BCD ┆ 12431 ┆ 12431 │ + │ -- 00 ┆ transform ┆ ?? 00 ┆ tr[---]orm ┆ transform │ + │ A^^u ┆ 12__*m ┆ A^^u ┆ 12__*m ┆ 12__*m │ + │ -O2 ┆ ┆ ?O2 ┆ ┆ │ + │ ┆ abbabbabba ┆ ┆ abbabbabba ┆ #bb# │ + └────────┴────────────┴────────┴────────────┴───────────┘ + """ return ColFn(ops.str_replace_all, self.arg, substr, replacement) def slice( self: ColExpr[String], offset: ColExpr[Int], n: ColExpr[Int] ) -> ColExpr[String]: - """""" + """ + Returns a substring of the input string. + + :param offset: + The 0-based index of the first character included in the result. + + :param n: + The number of characters to include. If the string is shorter than *offset* + + *n*, the result only includes as many characters as there are. + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [" BCD ", "-- 00", " A^^u", "-O2", ""], + ... "b": ["12431", "transform", "12__*m", " ", "abbabbabba"], + ... }, + ... name="string table", + ... ) + >>> ( + ... t + ... >> mutate( + ... j=t.a.str.slice(0, 2), + ... k=t.b.str.slice(4, 10), + ... ) + ... >> show() + ... ) + Table string table, backend: PolarsImpl + shape: (5, 4) + ┌────────┬────────────┬─────┬────────┐ + │ a ┆ b ┆ j ┆ k │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞════════╪════════════╪═════╪════════╡ + │ BCD ┆ 12431 ┆ ┆ 1 │ + │ -- 00 ┆ transform ┆ -- ┆ sform │ + │ A^^u ┆ 12__*m ┆ A ┆ *m │ + │ -O2 ┆ ┆ -O ┆ │ + │ ┆ abbabbabba ┆ ┆ bbabba │ + └────────┴────────────┴─────┴────────┘ + """ return ColFn(ops.str_slice, self.arg, offset, n) def starts_with(self: ColExpr[String], prefix: str) -> ColExpr[Bool]: - """""" + """ + Whether the string starts with a given prefix. + + :param prefix: + The prefix to check. 
+ + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [" BCD ", "-- 00", " A^^u", "-O2", ""], + ... "b": ["12431", "transform", "12__*m", " ", "abbabbabba"], + ... }, + ... name="string table", + ... ) + >>> ( + ... t + ... >> mutate( + ... j=t.a.str.starts_with("-"), + ... k=t.b.str.starts_with("12"), + ... ) + ... >> show() + ... ) + Table string table, backend: PolarsImpl + shape: (5, 4) + ┌────────┬────────────┬───────┬───────┐ + │ a ┆ b ┆ j ┆ k │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ bool ┆ bool │ + ╞════════╪════════════╪═══════╪═══════╡ + │ BCD ┆ 12431 ┆ false ┆ true │ + │ -- 00 ┆ transform ┆ true ┆ false │ + │ A^^u ┆ 12__*m ┆ false ┆ true │ + │ -O2 ┆ ┆ true ┆ false │ + │ ┆ abbabbabba ┆ false ┆ false │ + └────────┴────────────┴───────┴───────┘ + """ return ColFn(ops.str_starts_with, self.arg, prefix) def strip(self: ColExpr[String]) -> ColExpr[String]: - """""" + """ + Removes leading and trailing whitespace. + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [" BCD ", "-- 00", " A^^u", "-O2"], + ... "b": ["12431", "transform", "12__*m", " "], + ... }, + ... name="string table", + ... ) + >>> t >> mutate(j=t.a.str.strip(), k=t.b.str.strip()) >> show() + Table string table, backend: PolarsImpl + shape: (4, 4) + ┌────────┬───────────┬───────┬───────────┐ + │ a ┆ b ┆ j ┆ k │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞════════╪═══════════╪═══════╪═══════════╡ + │ BCD ┆ 12431 ┆ BCD ┆ 12431 │ + │ -- 00 ┆ transform ┆ -- 00 ┆ transform │ + │ A^^u ┆ 12__*m ┆ A^^u ┆ 12__*m │ + │ -O2 ┆ ┆ -O2 ┆ │ + └────────┴───────────┴───────┴───────────┘ + """ return ColFn(ops.str_strip, self.arg) @@ -1125,7 +1771,32 @@ def to_datetime(self: ColExpr[String]) -> ColExpr[Datetime]: return ColFn(ops.str_to_datetime, self.arg) def upper(self: ColExpr[String]) -> ColExpr[String]: - """""" + """ + Converts all alphabet letters to upper case. + + Examples + -------- + >>> t = pdt.Table( + ... { + ... "a": [" BCD ", "-- 00", " A^^u", "-O2"], + ... 
"b": ["12431", "transform", "12__*m", " "], + ... }, + ... name="string table", + ... ) + >>> t >> mutate(j=t.a.str.upper(), k=t.b.str.upper()) >> show() + Table string table, backend: PolarsImpl + shape: (4, 4) + ┌────────┬───────────┬────────┬───────────┐ + │ a ┆ b ┆ j ┆ k │ + │ --- ┆ --- ┆ --- ┆ --- │ + │ str ┆ str ┆ str ┆ str │ + ╞════════╪═══════════╪════════╪═══════════╡ + │ BCD ┆ 12431 ┆ BCD ┆ 12431 │ + │ -- 00 ┆ transform ┆ -- 00 ┆ TRANSFORM │ + │ A^^u ┆ 12__*m ┆ A^^U ┆ 12__*M │ + │ -O2 ┆ ┆ -O2 ┆ │ + └────────┴───────────┴────────┴───────────┘ + """ return ColFn(ops.str_upper, self.arg) @@ -1140,7 +1811,7 @@ def day(self: ColExpr[Date]) -> ColExpr[Int]: ... def day(self: ColExpr[Datetime]) -> ColExpr[Int]: ... def day(self: ColExpr) -> ColExpr: - """""" + """Extracts the day component.""" return ColFn(ops.dt_day, self.arg) @@ -1151,7 +1822,11 @@ def day_of_week(self: ColExpr[Date]) -> ColExpr[Int]: ... def day_of_week(self: ColExpr[Datetime]) -> ColExpr[Int]: ... def day_of_week(self: ColExpr) -> ColExpr: - """""" + """ + The number of the current weekday. + + This is one-based, so Monday is 1 and Sunday is 7. + """ return ColFn(ops.dt_day_of_week, self.arg) @@ -1162,27 +1837,31 @@ def day_of_year(self: ColExpr[Date]) -> ColExpr[Int]: ... def day_of_year(self: ColExpr[Datetime]) -> ColExpr[Int]: ... def day_of_year(self: ColExpr) -> ColExpr: - """""" + """ + The number of days since the beginning of the year. + + This is one-based, so it returns 1 for the 1st of January. 
+ """ return ColFn(ops.dt_day_of_year, self.arg) def hour(self: ColExpr[Datetime]) -> ColExpr[Int]: - """""" + """Extracts the hour component.""" return ColFn(ops.dt_hour, self.arg) def microsecond(self: ColExpr[Datetime]) -> ColExpr[Int]: - """""" + """Extracts the microsecond component.""" return ColFn(ops.dt_microsecond, self.arg) def millisecond(self: ColExpr[Datetime]) -> ColExpr[Int]: - """""" + """Extracts the millisecond component.""" return ColFn(ops.dt_millisecond, self.arg) def minute(self: ColExpr[Datetime]) -> ColExpr[Int]: - """""" + """Extracts the minute component.""" return ColFn(ops.dt_minute, self.arg) @@ -1193,12 +1872,12 @@ def month(self: ColExpr[Date]) -> ColExpr[Int]: ... def month(self: ColExpr[Datetime]) -> ColExpr[Int]: ... def month(self: ColExpr) -> ColExpr: - """""" + """Extracts the month component.""" return ColFn(ops.dt_month, self.arg) def second(self: ColExpr[Datetime]) -> ColExpr[Int]: - """""" + """Extracts the second component.""" return ColFn(ops.dt_second, self.arg) @@ -1209,7 +1888,7 @@ def year(self: ColExpr[Date]) -> ColExpr[Int]: ... def year(self: ColExpr[Datetime]) -> ColExpr[Int]: ... 
def year(self: ColExpr) -> ColExpr: - """""" + """Extracts the year component.""" return ColFn(ops.dt_year, self.arg) @@ -1295,7 +1974,7 @@ def export(self, target: Target) -> Any: Examples -------- >>> t1 = pdt.Table({"h": [2.465, 0.22, -4.477, 10.8, -81.2, 0.0]}) - >>> t1.h.export(Polars()) + >>> t1.h.show() shape: (6,) Series: 'h' [f64] [ @@ -1769,14 +2448,18 @@ def wrap_literal(expr: Any, *, allow_markers=False) -> Any: or ( # markers can only be at the top of an expression tree not isinstance(expr.op, Marker) - and any( - isinstance(arg, ColFn) and isinstance(arg.op, Marker) - for arg in expr.args + and ( + marker_args := [ + arg + for arg in expr.args + if isinstance(arg, ColFn) and isinstance(arg.op, Marker) + ] ) ) ): + marker = expr.op if isinstance(expr.op, Marker) else marker_args[0].op raise TypeError( - f"invalid usage of `{expr.op.name}` in a column expression.\n" + f"invalid usage of `{marker.name}` in a column expression.\n" "note: This marker function can only be used in arguments to the " "`arrange` verb or the `arrange=` keyword argument to window " "functions. 
Furthermore, all markers have to be at the top of the " diff --git a/src/pydiverse/transform/types.py b/src/pydiverse/transform/types.py index 8f2383c..cdc0a26 100644 --- a/src/pydiverse/transform/types.py +++ b/src/pydiverse/transform/types.py @@ -40,4 +40,5 @@ "Uint16", "Uint32", "Uint64", + "Dtype", ] diff --git a/tests/test_backend_equivalence/conftest.py b/tests/test_backend_equivalence/conftest.py index 0e44f3c..983fa1f 100644 --- a/tests/test_backend_equivalence/conftest.py +++ b/tests/test_backend_equivalence/conftest.py @@ -99,6 +99,20 @@ "-100110", " -56 ", ], + "e": [ + "abbabbabbabba", + "------", + "012", + "", + "", + None, + None, + " AbAbA..", + "-354.2", + "??", + "11", + "$&/)", + ], } ), "df_datetime": pl.DataFrame( @@ -200,6 +214,12 @@ "null_s": [0] + [None] * 7, } ), + "df_bool": pl.DataFrame( + { + "a": [True, True, True, False, False, None], + "b": [True, False, None, False, None, None], + }, + ), } # compare one dataframe and one SQL backend to all others diff --git a/tests/test_backend_equivalence/test_ops/test_ops_bool.py b/tests/test_backend_equivalence/test_ops/test_ops_bool.py new file mode 100644 index 0000000..f82f85d --- /dev/null +++ b/tests/test_backend_equivalence/test_ops/test_ops_bool.py @@ -0,0 +1,20 @@ +from __future__ import annotations + +from pydiverse.transform.extended import * +from tests.util.assertion import assert_result_equal + + +def test_or(df_bool): + assert_result_equal(df_bool, lambda t: t >> mutate(y=t.a | t.b)) + + +def test_and(df_bool): + assert_result_equal(df_bool, lambda t: t >> mutate(y=t.a & t.b)) + + +def test_xor(df_bool): + assert_result_equal(df_bool, lambda t: t >> mutate(y=t.a ^ t.b)) + + +def test_invert(df_bool): + assert_result_equal(df_bool, lambda t: t >> mutate(y=~t.a)) diff --git a/tests/test_backend_equivalence/test_ops/test_ops_numerical.py b/tests/test_backend_equivalence/test_ops/test_ops_numerical.py index ea3594f..6af0184 100644 --- 
a/tests/test_backend_equivalence/test_ops/test_ops_numerical.py +++ b/tests/test_backend_equivalence/test_ops/test_ops_numerical.py @@ -160,3 +160,9 @@ def test_is_nan(df_num): **{c.name + "is_not_nan": c.is_not_nan() for c in t}, ), ) + + +def test_int_pow(df_int): + assert_result_equal( + df_int, lambda t: t >> mutate(u=pdt.min(t.a, 10) ** pdt.min(t.b.abs(), 5)) + ) diff --git a/tests/test_backend_equivalence/test_ops/test_ops_string.py b/tests/test_backend_equivalence/test_ops/test_ops_string.py index 96999af..97bb7f7 100644 --- a/tests/test_backend_equivalence/test_ops/test_ops_string.py +++ b/tests/test_backend_equivalence/test_ops/test_ops_string.py @@ -99,6 +99,8 @@ def test_replace_all(df_strings): >> mutate( x=C.col1.str.replace_all(" ", "").str.replace_all("foo", "fOO"), y=C.col2.str.replace_all("Ab", "ab"), + z=C.e.str.replace_all("abba", "#"), + q=C.e.str.replace_all("--", "="), ), )