From 3edb82b8e0c3d7f8cd1894f999bf0695deb1a4ff Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 11 Aug 2023 17:47:55 +0200 Subject: [PATCH] Revert Cython upgrade (#54497) * Revert "CLN: Cython 3 cleanups (#54482)" This reverts commit a936863759b56f4452d20eaf195404044ec97e5b. * Revert "DEPS: Bump cython 3.0 (#54335)" This reverts commit 4cf63eaefe5ba24acb8d12f67324526447ce0391. --- asv_bench/asv.conf.json | 2 +- ci/deps/actions-310.yaml | 2 +- ci/deps/actions-311-downstream_compat.yaml | 2 +- ci/deps/actions-311-pyarrownightly.yaml | 2 +- ci/deps/actions-311.yaml | 2 +- ci/deps/actions-39-minimum_versions.yaml | 2 +- ci/deps/actions-39.yaml | 2 +- ci/deps/actions-pypy-39.yaml | 2 +- ci/deps/circle-310-arm64.yaml | 2 +- doc/source/whatsnew/v2.1.0.rst | 1 - environment.yml | 2 +- pandas/_libs/algos.pyx | 117 +++++++++++- pandas/_libs/arrays.pyi | 2 +- pandas/_libs/arrays.pyx | 3 +- pandas/_libs/groupby.pyi | 3 +- pandas/_libs/groupby.pyx | 203 +++++++++++++-------- pandas/_libs/hashtable.pyi | 9 +- pandas/_libs/hashtable_class_helper.pxi.in | 6 +- pandas/_libs/internals.pyx | 6 +- pandas/_libs/interval.pyx | 15 ++ pandas/_libs/lib.pyi | 22 +-- pandas/_libs/lib.pyx | 3 +- pandas/_libs/ops.pyi | 4 +- pandas/_libs/parsers.pyx | 7 +- pandas/_libs/sparse.pyi | 4 - pandas/_libs/tslibs/conversion.pyi | 1 - pandas/_libs/tslibs/conversion.pyx | 2 +- pandas/_libs/tslibs/dtypes.pyi | 4 +- pandas/_libs/tslibs/nattype.pyx | 30 ++- pandas/_libs/tslibs/np_datetime.pxd | 21 ++- pandas/_libs/tslibs/np_datetime.pyi | 2 +- pandas/_libs/tslibs/np_datetime.pyx | 3 - pandas/_libs/tslibs/offsets.pyi | 5 +- pandas/_libs/tslibs/offsets.pyx | 24 ++- pandas/_libs/tslibs/parsing.pyx | 3 +- pandas/_libs/tslibs/period.pyi | 2 +- pandas/_libs/tslibs/period.pyx | 10 +- pandas/_libs/tslibs/timedeltas.pyi | 8 +- pandas/_libs/tslibs/timedeltas.pyx | 9 +- pandas/_libs/tslibs/timestamps.pyi | 2 +- pandas/_libs/tslibs/timestamps.pyx | 13 ++ 
pandas/_libs/tslibs/tzconversion.pyi | 2 +- pandas/_libs/tslibs/vectorized.pyi | 2 +- pandas/_libs/window/aggregations.pyi | 4 +- pandas/core/arrays/datetimelike.py | 3 +- pyproject.toml | 2 +- requirements-dev.txt | 2 +- scripts/run_stubtest.py | 2 - setup.py | 2 +- 49 files changed, 412 insertions(+), 171 deletions(-) diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index b7dce4f63f94c..810764754b7e1 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -41,7 +41,7 @@ // pip (with all the conda available packages installed first, // followed by the pip installed packages). "matrix": { - "Cython": ["3.0.0"], + "Cython": ["0.29.33"], "matplotlib": [], "sqlalchemy": [], "scipy": [], diff --git a/ci/deps/actions-310.yaml b/ci/deps/actions-310.yaml index d8186d09bb9d4..ffa7732c604a0 100644 --- a/ci/deps/actions-310.yaml +++ b/ci/deps/actions-310.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=3.0.0 + - cython>=0.29.33 - meson[ninja]=1.0.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-311-downstream_compat.yaml b/ci/deps/actions-311-downstream_compat.yaml index a40640e99265a..5a6a26c2e1ad8 100644 --- a/ci/deps/actions-311-downstream_compat.yaml +++ b/ci/deps/actions-311-downstream_compat.yaml @@ -7,7 +7,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=3.0.0 + - cython>=0.29.33 - meson[ninja]=1.0.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-311-pyarrownightly.yaml b/ci/deps/actions-311-pyarrownightly.yaml index 3d5e2e99feb9c..f24e866af0439 100644 --- a/ci/deps/actions-311-pyarrownightly.yaml +++ b/ci/deps/actions-311-pyarrownightly.yaml @@ -7,7 +7,7 @@ dependencies: # build dependencies - versioneer[toml] - meson[ninja]=1.0.1 - - cython>=3.0.0 + - cython>=0.29.33 - meson-python=0.13.1 # test dependencies diff --git a/ci/deps/actions-311.yaml b/ci/deps/actions-311.yaml index 67203201ea637..9d60d734db5b3 100644 --- a/ci/deps/actions-311.yaml +++ 
b/ci/deps/actions-311.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=3.0.0 + - cython>=0.29.33 - meson[ninja]=1.0.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-39-minimum_versions.yaml b/ci/deps/actions-39-minimum_versions.yaml index c1c7b986fe8a4..0e2fcf87c2d6e 100644 --- a/ci/deps/actions-39-minimum_versions.yaml +++ b/ci/deps/actions-39-minimum_versions.yaml @@ -8,7 +8,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=3.0.0 + - cython>=0.29.33 - meson[ninja]=1.0.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-39.yaml b/ci/deps/actions-39.yaml index 658dfe032a42b..6ea0d41b947dc 100644 --- a/ci/deps/actions-39.yaml +++ b/ci/deps/actions-39.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=3.0.0 + - cython>=0.29.33 - meson[ninja]=1.0.1 - meson-python=0.13.1 diff --git a/ci/deps/actions-pypy-39.yaml b/ci/deps/actions-pypy-39.yaml index 292565e9640d9..035395d55eb3a 100644 --- a/ci/deps/actions-pypy-39.yaml +++ b/ci/deps/actions-pypy-39.yaml @@ -9,7 +9,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=3.0.0 + - cython>=0.29.33 - meson[ninja]=1.0.1 - meson-python=0.13.1 diff --git a/ci/deps/circle-310-arm64.yaml b/ci/deps/circle-310-arm64.yaml index 60bdc2b828dae..df4e8e285bd02 100644 --- a/ci/deps/circle-310-arm64.yaml +++ b/ci/deps/circle-310-arm64.yaml @@ -6,7 +6,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython>=3.0.0 + - cython>=0.29.33 - meson[ninja]=1.0.1 - meson-python=0.13.1 diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 998efdedb1b57..313bf61ecabf9 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -873,7 +873,6 @@ Other - Bug in :meth:`Series.memory_usage` when ``deep=True`` throw an error with Series of objects and the returned value is incorrect, as it does not take into account GC corrections (:issue:`51858`) - Bug in :meth:`period_range` 
the default behavior when freq was not passed as an argument was incorrect(:issue:`53687`) - Fixed incorrect ``__name__`` attribute of ``pandas._libs.json`` (:issue:`52898`) -- The minimum version of Cython needed to compile pandas is now ``3.0.0`` (:issue:`54335`) .. --------------------------------------------------------------------------- .. _whatsnew_210.contributors: diff --git a/environment.yml b/environment.yml index 44c0ce37c2957..3a0da0bfc703d 100644 --- a/environment.yml +++ b/environment.yml @@ -8,7 +8,7 @@ dependencies: # build dependencies - versioneer[toml] - - cython=3.0.0 + - cython=0.29.33 - meson[ninja]=1.0.1 - meson-python=0.13.1 diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index ed251c401c277..0b6ea58f987d4 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -998,7 +998,8 @@ def rank_1d( N = len(values) if labels is not None: - assert len(labels) == N + # TODO(cython3): cast won't be necessary (#2992) + assert <Py_ssize_t>len(labels) == N out = np.empty(N) grp_sizes = np.ones(N, dtype=np.int64) @@ -1087,7 +1088,8 @@ cdef void rank_sorted_1d( float64_t[::1] out, int64_t[::1] grp_sizes, const intp_t[:] sort_indexer, - const numeric_object_t[:] masked_vals, + # TODO(cython3): make const (https://github.com/cython/cython/issues/3222) + numeric_object_t[:] masked_vals, const uint8_t[:] mask, bint check_mask, Py_ssize_t N, @@ -1142,7 +1144,108 @@ cdef void rank_sorted_1d( # array that we sorted previously, which gives us the location of # that sorted value for retrieval back from the original # values / masked_vals arrays - with gil(numeric_object_t is object): + # TODO(cython3): de-duplicate once cython supports conditional nogil + if numeric_object_t is object: + with gil: + for i in range(N): + at_end = i == N - 1 + + # dups and sum_ranks will be incremented each loop where + # the value / group remains the same, and should be reset + # when either of those change. 
Used to calculate tiebreakers + dups += 1 + sum_ranks += i - grp_start + 1 + + next_val_diff = at_end or are_diff(masked_vals[sort_indexer[i]], + masked_vals[sort_indexer[i+1]]) + + # We'll need this check later anyway to determine group size, so just + # compute it here since shortcircuiting won't help + group_changed = at_end or (check_labels and + (labels[sort_indexer[i]] + != labels[sort_indexer[i+1]])) + + # Update out only when there is a transition of values or labels. + # When a new value or group is encountered, go back #dups steps( + # the number of occurrence of current value) and assign the ranks + # based on the starting index of the current group (grp_start) + # and the current index + if (next_val_diff or group_changed or (check_mask and + (mask[sort_indexer[i]] + ^ mask[sort_indexer[i+1]]))): + + # If keep_na, check for missing values and assign back + # to the result where appropriate + if keep_na and check_mask and mask[sort_indexer[i]]: + grp_na_count = dups + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = NaN + elif tiebreak == TIEBREAK_AVERAGE: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = sum_ranks / dups + elif tiebreak == TIEBREAK_MIN: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = i - grp_start - dups + 2 + elif tiebreak == TIEBREAK_MAX: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = i - grp_start + 1 + + # With n as the previous rank in the group and m as the number + # of duplicates in this stretch, if TIEBREAK_FIRST and ascending, + # then rankings should be n + 1, n + 2 ... n + m + elif tiebreak == TIEBREAK_FIRST: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = j + 1 - grp_start + + # If TIEBREAK_FIRST and descending, the ranking should be + # n + m, n + (m - 1) ... n + 1. 
This is equivalent to + # (i - dups + 1) + (i - j + 1) - grp_start + elif tiebreak == TIEBREAK_FIRST_DESCENDING: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = 2 * i - j - dups + 2 - grp_start + elif tiebreak == TIEBREAK_DENSE: + for j in range(i - dups + 1, i + 1): + out[sort_indexer[j]] = grp_vals_seen + + # Look forward to the next value (using the sorting in + # lexsort_indexer). If the value does not equal the current + # value then we need to reset the dups and sum_ranks, knowing + # that a new value is coming up. The conditional also needs + # to handle nan equality and the end of iteration. If group + # changes we do not record seeing a new value in the group + if not group_changed and (next_val_diff or (check_mask and + (mask[sort_indexer[i]] + ^ mask[sort_indexer[i+1]]))): + dups = sum_ranks = 0 + grp_vals_seen += 1 + + # Similar to the previous conditional, check now if we are + # moving to a new group. If so, keep track of the index where + # the new group occurs, so the tiebreaker calculations can + # decrement that from their position. Fill in the size of each + # group encountered (used by pct calculations later). 
Also be + # sure to reset any of the items helping to calculate dups + if group_changed: + + # If not dense tiebreak, group size used to compute + # percentile will be # of non-null elements in group + if tiebreak != TIEBREAK_DENSE: + grp_size = i - grp_start + 1 - grp_na_count + + # Otherwise, it will be the number of distinct values + # in the group, subtracting 1 if NaNs are present + # since that is a distinct value we shouldn't count + else: + grp_size = grp_vals_seen - (grp_na_count > 0) + + for j in range(grp_start, i + 1): + grp_sizes[sort_indexer[j]] = grp_size + + dups = sum_ranks = 0 + grp_na_count = 0 + grp_start = i + 1 + grp_vals_seen = 1 + else: for i in range(N): at_end = i == N - 1 @@ -1371,9 +1474,8 @@ ctypedef fused out_t: @cython.boundscheck(False) @cython.wraparound(False) def diff_2d( - # TODO: cython bug (post Cython 3) prevents update to "const diff_t[:, :] arr" - ndarray[diff_t, ndim=2] arr, - out_t[:, :] out, + ndarray[diff_t, ndim=2] arr, # TODO(cython3) update to "const diff_t[:, :] arr" + ndarray[out_t, ndim=2] out, Py_ssize_t periods, int axis, bint datetimelike=False, @@ -1381,8 +1483,7 @@ def diff_2d( cdef: Py_ssize_t i, j, sx, sy, start, stop bint f_contig = arr.flags.f_contiguous - # TODO: change to this when arr becomes a memoryview - # bint f_contig = arr.is_f_contig() + # bint f_contig = arr.is_f_contig() # TODO(cython3) diff_t left, right # Disable for unsupported dtype combinations, diff --git a/pandas/_libs/arrays.pyi b/pandas/_libs/arrays.pyi index 86f69c3cdfc75..78fee8f01319c 100644 --- a/pandas/_libs/arrays.pyi +++ b/pandas/_libs/arrays.pyi @@ -26,7 +26,7 @@ class NDArrayBacked: def size(self) -> int: ... @property def nbytes(self) -> int: ... - def copy(self, order=...): ... + def copy(self): ... def delete(self, loc, axis=...): ... def swapaxes(self, axis1, axis2): ... def repeat(self, repeats: int | Sequence[int], axis: int | None = ...): ... 
diff --git a/pandas/_libs/arrays.pyx b/pandas/_libs/arrays.pyx index 9889436a542c1..718fb358e26bc 100644 --- a/pandas/_libs/arrays.pyx +++ b/pandas/_libs/arrays.pyx @@ -126,7 +126,8 @@ cdef class NDArrayBacked: @property def size(self) -> int: - return self._ndarray.size + # TODO(cython3): use self._ndarray.size + return cnp.PyArray_SIZE(self._ndarray) @property def nbytes(self) -> int: diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 019b30900547d..d165ddd6c8afa 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -44,6 +44,7 @@ def group_fillna_indexer( labels: np.ndarray, # ndarray[int64_t] sorted_labels: npt.NDArray[np.intp], mask: npt.NDArray[np.uint8], + direction: Literal["ffill", "bfill"], limit: int, # int64_t dropna: bool, ) -> None: ... @@ -54,7 +55,7 @@ def group_any_all( mask: np.ndarray, # const uint8_t[::1] val_test: Literal["any", "all"], skipna: bool, - result_mask: np.ndarray | None, + nullable: bool, ) -> None: ... def group_sum( out: np.ndarray, # complexfloatingintuint_t[:, ::1] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 3384060f74c20..20499016f951e 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -695,8 +695,6 @@ def group_sum( N, K = (<object>values).shape - # TODO: Port this to use conditional nogil - # Note: There are some test failures since the object/non-object paths have diverged if sum_t is object: # NB: this does not use 'compensation' like the non-object track does. 
for i in range(N): @@ -757,9 +755,9 @@ def group_sum( compensation[lab, j] = 0 sumx[lab, j] = t - _check_below_mincount( - out, uses_mask, result_mask, ncounts, K, nobs, min_count, sumx - ) + _check_below_mincount( + out, uses_mask, result_mask, ncounts, K, nobs, min_count, sumx + ) @cython.wraparound(False) @@ -811,9 +809,9 @@ def group_prod( nobs[lab, j] += 1 prodx[lab, j] *= val - _check_below_mincount( - out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx - ) + _check_below_mincount( + out, uses_mask, result_mask, ncounts, K, nobs, min_count, prodx + ) @cython.wraparound(False) @@ -1371,7 +1369,7 @@ cdef numeric_t _get_na_val(numeric_t val, bint is_datetimelike): ctypedef fused mincount_t: - numeric_object_t + numeric_t complex64_t complex128_t @@ -1387,7 +1385,7 @@ cdef inline void _check_below_mincount( int64_t[:, ::1] nobs, int64_t min_count, mincount_t[:, ::1] resx, -) noexcept: +) noexcept nogil: """ Check if the number of observations for a group is below min_count, and if so set the result for that group to the appropriate NA-like value. @@ -1395,49 +1393,48 @@ cdef inline void _check_below_mincount( cdef: Py_ssize_t i, j - with nogil(mincount_t is not object): - for i in range(ncounts): - for j in range(K): + for i in range(ncounts): + for j in range(K): - if nobs[i, j] < min_count: - # if we are integer dtype, not is_datetimelike, and - # not uses_mask, then getting here implies that - # counts[i] < min_count, which means we will - # be cast to float64 and masked at the end - # of WrappedCythonOp._call_cython_op. So we can safely - # set a placeholder value in out[i, j]. - if uses_mask: - result_mask[i, j] = True - # set out[i, j] to 0 to be deterministic, as - # it was initialized with np.empty. Also ensures - # we can downcast out if appropriate. 
- out[i, j] = 0 - elif ( - mincount_t is float32_t - or mincount_t is float64_t - or mincount_t is complex64_t - or mincount_t is complex128_t - ): - out[i, j] = NAN - elif mincount_t is int64_t: - # Per above, this is a placeholder in - # non-is_datetimelike cases. - out[i, j] = NPY_NAT - elif mincount_t is object: - out[i, j] = None - else: - # placeholder, see above - out[i, j] = 0 + if nobs[i, j] < min_count: + # if we are integer dtype, not is_datetimelike, and + # not uses_mask, then getting here implies that + # counts[i] < min_count, which means we will + # be cast to float64 and masked at the end + # of WrappedCythonOp._call_cython_op. So we can safely + # set a placeholder value in out[i, j]. + if uses_mask: + result_mask[i, j] = True + # set out[i, j] to 0 to be deterministic, as + # it was initialized with np.empty. Also ensures + # we can downcast out if appropriate. + out[i, j] = 0 + elif ( + mincount_t is float32_t + or mincount_t is float64_t + or mincount_t is complex64_t + or mincount_t is complex128_t + ): + out[i, j] = NAN + elif mincount_t is int64_t: + # Per above, this is a placeholder in + # non-is_datetimelike cases. 
+ out[i, j] = NPY_NAT else: - out[i, j] = resx[i, j] + # placeholder, see above + out[i, j] = 0 + else: + out[i, j] = resx[i, j] +# TODO(cython3): GH#31710 use memoryviews once cython 0.30 is released so we can +# use `const numeric_object_t[:, :] values` @cython.wraparound(False) @cython.boundscheck(False) def group_last( numeric_object_t[:, ::1] out, int64_t[::1] counts, - const numeric_object_t[:, :] values, + ndarray[numeric_object_t, ndim=2] values, const intp_t[::1] labels, const uint8_t[:, :] mask, uint8_t[:, ::1] result_mask=None, @@ -1455,7 +1452,9 @@ def group_last( bint uses_mask = mask is not None bint isna_entry - if not len(values) == len(labels): + # TODO(cython3): + # Instead of `labels.shape[0]` use `len(labels)` + if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") min_count = max(min_count, 1) @@ -1467,7 +1466,8 @@ def group_last( N, K = (<object>values).shape - with nogil(numeric_object_t is not object): + if numeric_object_t is object: + # TODO(cython3): De-duplicate once conditional-nogil is available for i in range(N): lab = labels[i] if lab < 0: @@ -1480,28 +1480,53 @@ def group_last( if uses_mask: isna_entry = mask[i, j] else: - # TODO: just make _treat_as_na support this? - # remove notimplemented for object dtype there - if numeric_object_t is object: - isna_entry = checknull(val) - else: - isna_entry = _treat_as_na(val, is_datetimelike) + isna_entry = checknull(val) if not isna_entry: + # TODO(cython3): use _treat_as_na here once + # conditional-nogil is available. 
nobs[lab, j] += 1 resx[lab, j] = val - _check_below_mincount( - out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx - ) + for i in range(ncounts): + for j in range(K): + if nobs[i, j] < min_count: + out[i, j] = None + else: + out[i, j] = resx[i, j] + else: + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if uses_mask: + isna_entry = mask[i, j] + else: + isna_entry = _treat_as_na(val, is_datetimelike) + + if not isna_entry: + nobs[lab, j] += 1 + resx[lab, j] = val + _check_below_mincount( + out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx + ) + +# TODO(cython3): GH#31710 use memoryviews once cython 0.30 is released so we can +# use `const numeric_object_t[:, :] values` @cython.wraparound(False) @cython.boundscheck(False) def group_nth( numeric_object_t[:, ::1] out, int64_t[::1] counts, - const numeric_object_t[:, :] values, + ndarray[numeric_object_t, ndim=2] values, const intp_t[::1] labels, const uint8_t[:, :] mask, uint8_t[:, ::1] result_mask=None, @@ -1520,7 +1545,9 @@ def group_nth( bint uses_mask = mask is not None bint isna_entry - if not len(values) == len(labels): + # TODO(cython3): + # Instead of `labels.shape[0]` use `len(labels)` + if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") min_count = max(min_count, 1) @@ -1532,7 +1559,8 @@ def group_nth( N, K = (<object>values).shape - with nogil(numeric_object_t is not object): + if numeric_object_t is object: + # TODO(cython3): De-duplicate once conditional-nogil is available for i in range(N): lab = labels[i] if lab < 0: @@ -1545,21 +1573,46 @@ def group_nth( if uses_mask: isna_entry = mask[i, j] else: - # TODO: just make _treat_as_na support this? 
- # remove notimplemented for object dtype there - if numeric_object_t is object: - isna_entry = checknull(val) - else: - isna_entry = _treat_as_na(val, is_datetimelike) + isna_entry = checknull(val) if not isna_entry: + # TODO(cython3): use _treat_as_na here once + # conditional-nogil is available. nobs[lab, j] += 1 if nobs[lab, j] == rank: resx[lab, j] = val - _check_below_mincount( - out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx - ) + for i in range(ncounts): + for j in range(K): + if nobs[i, j] < min_count: + out[i, j] = None + else: + out[i, j] = resx[i, j] + + else: + with nogil: + for i in range(N): + lab = labels[i] + if lab < 0: + continue + + counts[lab] += 1 + for j in range(K): + val = values[i, j] + + if uses_mask: + isna_entry = mask[i, j] + else: + isna_entry = _treat_as_na(val, is_datetimelike) + + if not isna_entry: + nobs[lab, j] += 1 + if nobs[lab, j] == rank: + resx[lab, j] = val + + _check_below_mincount( + out, uses_mask, result_mask, ncounts, K, nobs, min_count, resx + ) @cython.boundscheck(False) @@ -1651,7 +1704,7 @@ def group_rank( cdef group_min_max( numeric_t[:, ::1] out, int64_t[::1] counts, - const numeric_t[:, :] values, + ndarray[numeric_t, ndim=2] values, const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, @@ -1699,7 +1752,9 @@ cdef group_min_max( bint uses_mask = mask is not None bint isna_entry - if not len(values) == len(labels): + # TODO(cython3): + # Instead of `labels.shape[0]` use `len(labels)` + if not len(values) == labels.shape[0]: raise AssertionError("len(index) != len(labels)") min_count = max(min_count, 1) @@ -1734,9 +1789,9 @@ cdef group_min_max( if val < group_min_or_max[lab, j]: group_min_or_max[lab, j] = val - _check_below_mincount( - out, uses_mask, result_mask, ngroups, K, nobs, min_count, group_min_or_max - ) + _check_below_mincount( + out, uses_mask, result_mask, ngroups, K, nobs, min_count, group_min_or_max + ) @cython.wraparound(False) @@ -1744,7 +1799,7 @@ cdef 
group_min_max( def group_max( numeric_t[:, ::1] out, int64_t[::1] counts, - const numeric_t[:, :] values, + ndarray[numeric_t, ndim=2] values, const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, @@ -1770,7 +1825,7 @@ def group_max( def group_min( numeric_t[:, ::1] out, int64_t[::1] counts, - const numeric_t[:, :] values, + ndarray[numeric_t, ndim=2] values, const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, diff --git a/pandas/_libs/hashtable.pyi b/pandas/_libs/hashtable.pyi index 8069637a9bff4..2bc6d74fe6aee 100644 --- a/pandas/_libs/hashtable.pyi +++ b/pandas/_libs/hashtable.pyi @@ -20,6 +20,7 @@ class Factorizer: def factorize( self, values: np.ndarray, + sort: bool = ..., na_sentinel=..., na_value=..., mask=..., @@ -156,9 +157,9 @@ class HashTable: def __contains__(self, key: Hashable) -> bool: ... def sizeof(self, deep: bool = ...) -> int: ... def get_state(self) -> dict[str, int]: ... - # TODO: `val/key` type is subclass-specific - def get_item(self, val): ... # TODO: return type? - def set_item(self, key, val) -> None: ... + # TODO: `item` type is subclass-specific + def get_item(self, item): ... # TODO: return type? + def set_item(self, item, val) -> None: ... def get_na(self): ... # TODO: return type? def set_na(self, val) -> None: ... def map_locations( @@ -184,7 +185,6 @@ class HashTable: self, values: np.ndarray, # np.ndarray[subclass-specific] return_inverse: bool = ..., - mask=..., ) -> ( tuple[ np.ndarray, # np.ndarray[subclass-specific] @@ -198,7 +198,6 @@ class HashTable: na_sentinel: int = ..., na_value: object = ..., mask=..., - ignore_na: bool = True, ) -> tuple[np.ndarray, npt.NDArray[np.intp]]: ... # np.ndarray[subclass-specific] class Complex128HashTable(HashTable): ... 
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index c0723392496c1..1cf5d734705af 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -1239,10 +1239,9 @@ cdef class StringHashTable(HashTable): na_value=na_value, ignore_na=ignore_na, return_inverse=True) - # Add unused mask parameter for compat with other signatures def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None, object mask=None): + object na_value=None): # -> np.ndarray[np.intp] _, labels = self._unique(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value, @@ -1497,10 +1496,9 @@ cdef class PyObjectHashTable(HashTable): na_value=na_value, ignore_na=ignore_na, return_inverse=True) - # Add unused mask parameter for compat with other signatures def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None, object mask=None): + object na_value=None): # -> np.ndarray[np.intp] _, labels = self._unique(values, uniques, count_prior=count_prior, na_sentinel=na_sentinel, na_value=na_value, diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 83ea99c13b153..adf4e8c926fa3 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -2,10 +2,14 @@ from collections import defaultdict import weakref cimport cython -from cpython.pyport cimport PY_SSIZE_T_MAX from cpython.slice cimport PySlice_GetIndicesEx from cython cimport Py_ssize_t + +cdef extern from "Python.h": + # TODO(cython3): from cpython.pyport cimport PY_SSIZE_T_MAX + Py_ssize_t PY_SSIZE_T_MAX + import numpy as np cimport numpy as cnp diff --git a/pandas/_libs/interval.pyx b/pandas/_libs/interval.pyx index 44f54bb451283..e07d80dd04b31 100644 --- a/pandas/_libs/interval.pyx +++ b/pandas/_libs/interval.pyx @@ -511,6 
+511,17 @@ cdef class Interval(IntervalMixin): or is_timedelta64_object(y) ): return Interval(self.left + y, self.right + y, closed=self.closed) + elif ( + # __radd__ pattern + # TODO(cython3): remove this + isinstance(y, Interval) + and ( + isinstance(self, numbers.Number) + or PyDelta_Check(self) + or is_timedelta64_object(self) + ) + ): + return Interval(y.left + self, y.right + self, closed=y.closed) return NotImplemented def __radd__(self, other): @@ -534,6 +545,10 @@ cdef class Interval(IntervalMixin): def __mul__(self, y): if isinstance(y, numbers.Number): return Interval(self.left * y, self.right * y, closed=self.closed) + elif isinstance(y, Interval) and isinstance(self, numbers.Number): + # __rmul__ semantics + # TODO(cython3): remove this + return Interval(y.left * self, y.right * self, closed=y.closed) return NotImplemented def __rmul__(self, other): diff --git a/pandas/_libs/lib.pyi b/pandas/_libs/lib.pyi index 7e92032a73325..32641319a6b96 100644 --- a/pandas/_libs/lib.pyi +++ b/pandas/_libs/lib.pyi @@ -45,24 +45,22 @@ def is_scalar(val: object) -> bool: ... def is_list_like(obj: object, allow_sets: bool = ...) -> bool: ... def is_pyarrow_array(obj: object) -> bool: ... def is_period(val: object) -> TypeGuard[Period]: ... -def is_interval(obj: object) -> TypeGuard[Interval]: ... -def is_decimal(obj: object) -> TypeGuard[Decimal]: ... -def is_complex(obj: object) -> TypeGuard[complex]: ... -def is_bool(obj: object) -> TypeGuard[bool | np.bool_]: ... -def is_integer(obj: object) -> TypeGuard[int | np.integer]: ... +def is_interval(val: object) -> TypeGuard[Interval]: ... +def is_decimal(val: object) -> TypeGuard[Decimal]: ... +def is_complex(val: object) -> TypeGuard[complex]: ... +def is_bool(val: object) -> TypeGuard[bool | np.bool_]: ... +def is_integer(val: object) -> TypeGuard[int | np.integer]: ... def is_int_or_none(obj) -> bool: ... -def is_float(obj: object) -> TypeGuard[float]: ... +def is_float(val: object) -> TypeGuard[float]: ... 
def is_interval_array(values: np.ndarray) -> bool: ... -def is_datetime64_array(values: np.ndarray, skipna: bool = True) -> bool: ... -def is_timedelta_or_timedelta64_array( - values: np.ndarray, skipna: bool = True -) -> bool: ... +def is_datetime64_array(values: np.ndarray) -> bool: ... +def is_timedelta_or_timedelta64_array(values: np.ndarray) -> bool: ... def is_datetime_with_singletz_array(values: np.ndarray) -> bool: ... def is_time_array(values: np.ndarray, skipna: bool = ...): ... def is_date_array(values: np.ndarray, skipna: bool = ...): ... def is_datetime_array(values: np.ndarray, skipna: bool = ...): ... def is_string_array(values: np.ndarray, skipna: bool = ...): ... -def is_float_array(values: np.ndarray): ... +def is_float_array(values: np.ndarray, skipna: bool = ...): ... def is_integer_array(values: np.ndarray, skipna: bool = ...): ... def is_bool_array(values: np.ndarray, skipna: bool = ...): ... def fast_multiget(mapping: dict, keys: np.ndarray, default=...) -> np.ndarray: ... @@ -183,7 +181,7 @@ def count_level_2d( max_bin: int, ) -> np.ndarray: ... # np.ndarray[np.int64, ndim=2] def get_level_sorter( - codes: np.ndarray, # const int64_t[:] + label: np.ndarray, # const int64_t[:] starts: np.ndarray, # const intp_t[:] ) -> np.ndarray: ... 
# np.ndarray[np.intp, ndim=1] def generate_bins_dt64( diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 38695fbb8222b..55819ebd1f15e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -512,7 +512,8 @@ def get_reverse_indexer(const intp_t[:] indexer, Py_ssize_t length) -> ndarray: @cython.wraparound(False) @cython.boundscheck(False) -def has_infs(const floating[:] arr) -> bool: +# TODO(cython3): Can add const once cython#1772 is resolved +def has_infs(floating[:] arr) -> bool: cdef: Py_ssize_t i, n = len(arr) floating inf, neginf, val diff --git a/pandas/_libs/ops.pyi b/pandas/_libs/ops.pyi index 6738a1dff4a9e..515f7aa53ba15 100644 --- a/pandas/_libs/ops.pyi +++ b/pandas/_libs/ops.pyi @@ -37,8 +37,8 @@ def vec_binop( @overload def maybe_convert_bool( arr: npt.NDArray[np.object_], - true_values: Iterable | None = None, - false_values: Iterable | None = None, + true_values: Iterable = ..., + false_values: Iterable = ..., convert_to_masked_nullable: Literal[False] = ..., ) -> tuple[np.ndarray, None]: ... @overload diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index e447d3b0f5920..6d66e21ce49f5 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -35,7 +35,6 @@ from cpython.unicode cimport ( PyUnicode_AsUTF8String, PyUnicode_Decode, PyUnicode_DecodeUTF8, - PyUnicode_FromString, ) from cython cimport Py_ssize_t from libc.stdlib cimport free @@ -45,6 +44,12 @@ from libc.string cimport ( strncpy, ) + +cdef extern from "Python.h": + # TODO(cython3): get this from cpython.unicode + object PyUnicode_FromString(char *v) + + import numpy as np cimport numpy as cnp diff --git a/pandas/_libs/sparse.pyi b/pandas/_libs/sparse.pyi index 536265b25425e..9e5cecc61e5ca 100644 --- a/pandas/_libs/sparse.pyi +++ b/pandas/_libs/sparse.pyi @@ -39,10 +39,6 @@ class BlockIndex(SparseIndex): self, length: int, blocs: np.ndarray, blengths: np.ndarray ) -> None: ... 
- # Override to have correct parameters - def intersect(self, other: SparseIndex) -> Self: ... - def make_union(self, y: SparseIndex) -> Self: ... - def make_mask_object_ndarray( arr: npt.NDArray[np.object_], fill_value ) -> npt.NDArray[np.bool_]: ... diff --git a/pandas/_libs/tslibs/conversion.pyi b/pandas/_libs/tslibs/conversion.pyi index 6426e32c52304..d564d767f7f05 100644 --- a/pandas/_libs/tslibs/conversion.pyi +++ b/pandas/_libs/tslibs/conversion.pyi @@ -10,6 +10,5 @@ TD64NS_DTYPE: np.dtype def precision_from_unit( unit: str, - out_reso: int = ..., ) -> tuple[int, int]: ... # (int64_t, _) def localize_pydatetime(dt: datetime, tz: tzinfo | None) -> datetime: ... diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 2a2a0f347ce12..45c4d7809fe7a 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -5,7 +5,6 @@ from libc.math cimport log10 from numpy cimport ( int32_t, int64_t, - npy_datetime, ) cnp.import_array() @@ -44,6 +43,7 @@ from pandas._libs.tslibs.np_datetime cimport ( get_datetime64_value, get_implementation_bounds, import_pandas_datetime, + npy_datetime, npy_datetimestruct, npy_datetimestruct_to_datetime, pandas_datetime_to_datetimestruct, diff --git a/pandas/_libs/tslibs/dtypes.pyi b/pandas/_libs/tslibs/dtypes.pyi index b0293d2e0fcf2..bea3e18273318 100644 --- a/pandas/_libs/tslibs/dtypes.pyi +++ b/pandas/_libs/tslibs/dtypes.pyi @@ -5,10 +5,10 @@ from enum import Enum _attrname_to_abbrevs: dict[str, str] _period_code_map: dict[str, int] -def periods_per_day(reso: int = ...) -> int: ... +def periods_per_day(reso: int) -> int: ... def periods_per_second(reso: int) -> int: ... def is_supported_unit(reso: int) -> bool: ... -def npy_unit_to_abbrev(unit: int) -> str: ... +def npy_unit_to_abbrev(reso: int) -> str: ... def get_supported_reso(reso: int) -> int: ... def abbrev_to_npy_unit(abbrev: str) -> int: ... 
diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 04a6858297aee..7d75fa3114d2b 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -128,6 +128,11 @@ cdef class _NaT(datetime): return NotImplemented def __add__(self, other): + if self is not c_NaT: + # TODO(cython3): remove this it moved to __radd__ + # cython __radd__ semantics + self, other = other, self + if PyDateTime_Check(other): return c_NaT elif PyDelta_Check(other): @@ -157,6 +162,15 @@ cdef class _NaT(datetime): def __sub__(self, other): # Duplicate some logic from _Timestamp.__sub__ to avoid needing # to subclass; allows us to @final(_Timestamp.__sub__) + cdef: + bint is_rsub = False + + if self is not c_NaT: + # cython __rsub__ semantics + # TODO(cython3): remove __rsub__ logic from here + self, other = other, self + is_rsub = True + if PyDateTime_Check(other): return c_NaT elif PyDelta_Check(other): @@ -170,9 +184,19 @@ cdef class _NaT(datetime): elif util.is_array(other): if other.dtype.kind == "m": - # NaT - timedelta64 we treat NaT as datetime64, so result - # is datetime64 - result = np.empty(other.shape, dtype="datetime64[ns]") + if not is_rsub: + # NaT - timedelta64 we treat NaT as datetime64, so result + # is datetime64 + result = np.empty(other.shape, dtype="datetime64[ns]") + result.fill("NaT") + return result + + # __rsub__ logic here + # TODO(cython3): remove this, move above code out of + # ``if not is_rsub`` block + # timedelta64 - NaT we have to treat NaT as timedelta64 + # for this to be meaningful, and the result is timedelta64 + result = np.empty(other.shape, dtype="timedelta64[ns]") result.fill("NaT") return result diff --git a/pandas/_libs/tslibs/np_datetime.pxd b/pandas/_libs/tslibs/np_datetime.pxd index bf29184d7a94b..60532174e8bdc 100644 --- a/pandas/_libs/tslibs/np_datetime.pxd +++ b/pandas/_libs/tslibs/np_datetime.pxd @@ -6,22 +6,35 @@ from cpython.datetime cimport ( from numpy cimport ( int32_t, int64_t, - 
npy_datetime, - npy_timedelta, ) +# TODO(cython3): most of these can be cimported directly from numpy +cdef extern from "numpy/ndarrayobject.h": + ctypedef int64_t npy_timedelta + ctypedef int64_t npy_datetime + cdef extern from "numpy/ndarraytypes.h": ctypedef struct PyArray_DatetimeMetaData: NPY_DATETIMEUNIT base int64_t num +cdef extern from "numpy/arrayscalars.h": + ctypedef struct PyDatetimeScalarObject: + # PyObject_HEAD + npy_datetime obval + PyArray_DatetimeMetaData obmeta + + ctypedef struct PyTimedeltaScalarObject: + # PyObject_HEAD + npy_timedelta obval + PyArray_DatetimeMetaData obmeta + cdef extern from "numpy/ndarraytypes.h": ctypedef struct npy_datetimestruct: int64_t year int32_t month, day, hour, min, sec, us, ps, as - # TODO: Can remove this once NPY_FR_GENERIC is added to - # the Cython __init__.pxd for numpy + ctypedef enum NPY_DATETIMEUNIT: NPY_FR_Y NPY_FR_M diff --git a/pandas/_libs/tslibs/np_datetime.pyi b/pandas/_libs/tslibs/np_datetime.pyi index c42bc43ac9d89..0cb0e3b0237d7 100644 --- a/pandas/_libs/tslibs/np_datetime.pyi +++ b/pandas/_libs/tslibs/np_datetime.pyi @@ -9,7 +9,7 @@ class OutOfBoundsTimedelta(ValueError): ... def py_get_unit_from_dtype(dtype: np.dtype): ... def py_td64_to_tdstruct(td64: int, unit: int) -> dict: ... 
def astype_overflowsafe( - values: np.ndarray, + arr: np.ndarray, dtype: np.dtype, copy: bool = ..., round_ok: bool = ..., diff --git a/pandas/_libs/tslibs/np_datetime.pyx b/pandas/_libs/tslibs/np_datetime.pyx index 8873695c23381..7b2ee68c73ad2 100644 --- a/pandas/_libs/tslibs/np_datetime.pyx +++ b/pandas/_libs/tslibs/np_datetime.pyx @@ -28,11 +28,8 @@ cimport numpy as cnp cnp.import_array() from numpy cimport ( - PyDatetimeScalarObject, - PyTimedeltaScalarObject, int64_t, ndarray, - npy_datetime, uint8_t, ) diff --git a/pandas/_libs/tslibs/offsets.pyi b/pandas/_libs/tslibs/offsets.pyi index 1d37477573023..1a4742111db89 100644 --- a/pandas/_libs/tslibs/offsets.pyi +++ b/pandas/_libs/tslibs/offsets.pyi @@ -277,10 +277,7 @@ def roll_qtrday( INVALID_FREQ_ERR_MSG: Literal["Invalid frequency: {0}"] def shift_months( - dtindex: npt.NDArray[np.int64], - months: int, - day_opt: str | None = ..., - reso: int = ..., + dtindex: npt.NDArray[np.int64], months: int, day_opt: str | None = ... ) -> npt.NDArray[np.int64]: ... 
_offset_map: dict[str, BaseOffset] diff --git a/pandas/_libs/tslibs/offsets.pyx b/pandas/_libs/tslibs/offsets.pyx index f330a0cea1917..958fe1181d309 100644 --- a/pandas/_libs/tslibs/offsets.pyx +++ b/pandas/_libs/tslibs/offsets.pyx @@ -479,7 +479,12 @@ cdef class BaseOffset: return type(self)(n=1, normalize=self.normalize, **self.kwds) def __add__(self, other): - if util.is_array(other) and other.dtype == object: + if not isinstance(self, BaseOffset): + # cython semantics; this is __radd__ + # TODO(cython3): remove this, this moved to __radd__ + return other.__add__(self) + + elif util.is_array(other) and other.dtype == object: return np.array([self + x for x in other]) try: @@ -496,6 +501,10 @@ cdef class BaseOffset: elif type(other) is type(self): return type(self)(self.n - other.n, normalize=self.normalize, **self.kwds) + elif not isinstance(self, BaseOffset): + # TODO(cython3): remove, this moved to __rsub__ + # cython semantics, this is __rsub__ + return (-other).__add__(self) else: # e.g. 
PeriodIndex return NotImplemented @@ -509,6 +518,10 @@ cdef class BaseOffset: elif is_integer_object(other): return type(self)(n=other * self.n, normalize=self.normalize, **self.kwds) + elif not isinstance(self, BaseOffset): + # TODO(cython3): remove this, this moved to __rmul__ + # cython semantics, this is __rmul__ + return other.__mul__(self) return NotImplemented def __rmul__(self, other): @@ -997,6 +1010,10 @@ cdef class Tick(SingleConstructorOffset): return self.delta.__gt__(other) def __mul__(self, other): + if not isinstance(self, Tick): + # TODO(cython3), remove this, this moved to __rmul__ + # cython semantics, this is __rmul__ + return other.__mul__(self) if is_float_object(other): n = other * self.n # If the new `n` is an integer, we can represent it using the @@ -1024,6 +1041,11 @@ cdef class Tick(SingleConstructorOffset): return _wrap_timedelta_result(result) def __add__(self, other): + if not isinstance(self, Tick): + # cython semantics; this is __radd__ + # TODO(cython3): remove this, this moved to __radd__ + return other.__add__(self) + if isinstance(other, Tick): if type(self) is type(other): return type(self)(self.n + other.n) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 5e3ed8d99c659..3643c840a50a6 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -774,7 +774,8 @@ def try_parse_year_month_day( object[::1] result n = len(years) - if len(months) != n or len(days) != n: + # TODO(cython3): Use len instead of `shape[0]` + if months.shape[0] != n or days.shape[0] != n: raise ValueError("Length of years/months/days must all be equal") result = np.empty(n, dtype="O") diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi index b3aa6c34e323f..8826757e31c32 100644 --- a/pandas/_libs/tslibs/period.pyi +++ b/pandas/_libs/tslibs/period.pyi @@ -89,7 +89,7 @@ class Period(PeriodMixin): @classmethod def _from_ordinal(cls, ordinal: int, freq) -> Period: ... 
@classmethod - def now(cls, freq: BaseOffset) -> Period: ... + def now(cls, freq: BaseOffset = ...) -> Period: ... def strftime(self, fmt: str) -> str: ... def to_timestamp( self, diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index eadb23e0a94ca..c37e9cd7ef1f3 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -1838,6 +1838,10 @@ cdef class _Period(PeriodMixin): def __add__(self, other): if not is_period_object(self): + # cython semantics; this is analogous to a call to __radd__ + # TODO(cython3): remove this + if self is NaT: + return NaT return other.__add__(self) if is_any_td_scalar(other): @@ -1872,6 +1876,10 @@ cdef class _Period(PeriodMixin): def __sub__(self, other): if not is_period_object(self): + # cython semantics; this is like a call to __rsub__ + # TODO(cython3): remove this + if self is NaT: + return NaT return NotImplemented elif ( @@ -2503,7 +2511,7 @@ cdef class _Period(PeriodMixin): object_state = None, self.freq, self.ordinal return (Period, object_state) - def strftime(self, fmt: str | None) -> str: + def strftime(self, fmt: str) -> str: r""" Returns a formatted string representation of the :class:`Period`. diff --git a/pandas/_libs/tslibs/timedeltas.pyi b/pandas/_libs/tslibs/timedeltas.pyi index da88ad32d625b..aba9b25b23154 100644 --- a/pandas/_libs/tslibs/timedeltas.pyi +++ b/pandas/_libs/tslibs/timedeltas.pyi @@ -68,7 +68,7 @@ UnitChoices: TypeAlias = Literal[ _S = TypeVar("_S", bound=timedelta) def ints_to_pytimedelta( - m8values: npt.NDArray[np.timedelta64], + arr: npt.NDArray[np.timedelta64], box: bool = ..., ) -> npt.NDArray[np.object_]: ... def array_to_timedelta64( @@ -162,10 +162,8 @@ class Timedelta(timedelta): def __gt__(self, other: timedelta) -> bool: ... def __hash__(self) -> int: ... def isoformat(self) -> str: ... - def to_numpy( - self, dtype: npt.DTypeLike = ..., copy: bool = False - ) -> np.timedelta64: ... - def view(self, dtype: npt.DTypeLike) -> object: ... 
+ def to_numpy(self) -> np.timedelta64: ... + def view(self, dtype: npt.DTypeLike = ...) -> object: ... @property def unit(self) -> str: ... def as_unit(self, unit: str, round_ok: bool = ...) -> Timedelta: ... diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index d2b57f447c350..ffa9a67542e21 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -1043,9 +1043,8 @@ cdef class _Timedelta(timedelta): """ return npy_unit_to_abbrev(self._creso) - # TODO: make cdef property once this works in Cython @property - def days(self) -> int: + def days(self) -> int: # TODO(cython3): make cdef property """ Returns the days of the timedelta. @@ -1068,9 +1067,8 @@ cdef class _Timedelta(timedelta): self._ensure_components() return self._d - # TODO: make cdef property once this works in Cython @property - def seconds(self) -> int: + def seconds(self) -> int: # TODO(cython3): make cdef property """ Return the total hours, minutes, and seconds of the timedelta as seconds. @@ -1107,9 +1105,8 @@ cdef class _Timedelta(timedelta): self._ensure_components() return self._h * 3600 + self._m * 60 + self._s - # TODO: make cdef property once this works in Cython @property - def microseconds(self) -> int: + def microseconds(self) -> int: # TODO(cython3): make cdef property # NB: using the python C-API PyDateTime_DELTA_GET_MICROSECONDS will fail # (or be incorrect) self._ensure_components() diff --git a/pandas/_libs/tslibs/timestamps.pyi b/pandas/_libs/tslibs/timestamps.pyi index 24c0a07eb7985..36ae2d6d892f1 100644 --- a/pandas/_libs/tslibs/timestamps.pyi +++ b/pandas/_libs/tslibs/timestamps.pyi @@ -180,7 +180,7 @@ class Timestamp(datetime): def is_year_end(self) -> bool: ... def to_pydatetime(self, warn: bool = ...) -> datetime: ... def to_datetime64(self) -> np.datetime64: ... - def to_period(self, freq: BaseOffset | str | None = None) -> Period: ... + def to_period(self, freq: BaseOffset | str = ...) -> Period: ... 
def to_julian_date(self) -> np.float64: ... @property def asm8(self) -> np.datetime64: ... diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 536a8372c64a8..844fc8f0ed187 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -476,6 +476,11 @@ cdef class _Timestamp(ABCTimestamp): dtype=object, ) + elif not isinstance(self, _Timestamp): + # cython semantics, args have been switched and this is __radd__ + # TODO(cython3): remove this it moved to __radd__ + return other.__add__(self) + return NotImplemented def __radd__(self, other): @@ -509,10 +514,13 @@ cdef class _Timestamp(ABCTimestamp): and (PyDateTime_Check(other) or is_datetime64_object(other))): # both_timestamps is to determine whether Timedelta(self - other) # should raise the OOB error, or fall back returning a timedelta. + # TODO(cython3): clean out the bits that moved to __rsub__ both_timestamps = (isinstance(other, _Timestamp) and isinstance(self, _Timestamp)) if isinstance(self, _Timestamp): other = type(self)(other) + else: + self = type(other)(self) if (self.tzinfo is None) ^ (other.tzinfo is None): raise TypeError( @@ -542,6 +550,11 @@ cdef class _Timestamp(ABCTimestamp): # We get here in stata tests, fall back to stdlib datetime # method and return stdlib timedelta object pass + elif is_datetime64_object(self): + # GH#28286 cython semantics for __rsub__, `other` is actually + # the Timestamp + # TODO(cython3): remove this, this moved to __rsub__ + return type(other)(self) - other return NotImplemented diff --git a/pandas/_libs/tslibs/tzconversion.pyi b/pandas/_libs/tslibs/tzconversion.pyi index 2108fa0f35547..a354765a348ec 100644 --- a/pandas/_libs/tslibs/tzconversion.pyi +++ b/pandas/_libs/tslibs/tzconversion.pyi @@ -10,7 +10,7 @@ from pandas._typing import npt # tz_convert_from_utc_single exposed for testing def tz_convert_from_utc_single( - utc_val: np.int64, tz: tzinfo, creso: int = ... 
+ val: np.int64, tz: tzinfo, creso: int = ... ) -> np.int64: ... def tz_localize_to_utc( vals: npt.NDArray[np.int64], diff --git a/pandas/_libs/tslibs/vectorized.pyi b/pandas/_libs/tslibs/vectorized.pyi index de19f592da62b..3fd9e2501e611 100644 --- a/pandas/_libs/tslibs/vectorized.pyi +++ b/pandas/_libs/tslibs/vectorized.pyi @@ -31,7 +31,7 @@ def get_resolution( reso: int = ..., # NPY_DATETIMEUNIT ) -> Resolution: ... def ints_to_pydatetime( - stamps: npt.NDArray[np.int64], + arr: npt.NDArray[np.int64], tz: tzinfo | None = ..., box: str = ..., reso: int = ..., # NPY_DATETIMEUNIT diff --git a/pandas/_libs/window/aggregations.pyi b/pandas/_libs/window/aggregations.pyi index a6cfbec9b15b9..b926a7cb73425 100644 --- a/pandas/_libs/window/aggregations.pyi +++ b/pandas/_libs/window/aggregations.pyi @@ -111,8 +111,8 @@ def ewm( com: float, # float64_t adjust: bool, ignore_na: bool, - deltas: np.ndarray | None = None, # const float64_t[:] - normalize: bool = True, + deltas: np.ndarray, # const float64_t[:] + normalize: bool, ) -> np.ndarray: ... 
# np.ndarray[np.float64] def ewmcov( input_x: np.ndarray, # const float64_t[:] diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 2b43b090a43e0..c3b8d1c0e79e8 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -2260,7 +2260,8 @@ def _concat_same_type( return new_obj def copy(self, order: str = "C") -> Self: - new_obj = super().copy(order=order) + # error: Unexpected keyword argument "order" for "copy" + new_obj = super().copy(order=order) # type: ignore[call-arg] new_obj._freq = self.freq return new_obj diff --git a/pyproject.toml b/pyproject.toml index ae658329f42ee..1034196baa15e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "meson-python==0.13.1", "meson==1.0.1", "wheel", - "Cython>=3.0.0", # Note: sync with setup.py, environment.yml and asv.conf.json + "Cython>=0.29.33,<3", # Note: sync with setup.py, environment.yml and asv.conf.json # Note: numpy 1.25 has a backwards compatible C API by default # we don't want to force users to compile with 1.25 though # (Ideally, in the future, though, oldest-supported-numpy can be dropped when our min numpy is 1.25.x) diff --git a/requirements-dev.txt b/requirements-dev.txt index 4af4351413a5b..0944acbc36c9b 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -3,7 +3,7 @@ pip versioneer[toml] -cython==3.0.0 +cython==0.29.33 meson[ninja]==1.0.1 meson-python==0.13.1 pytest>=7.3.2 diff --git a/scripts/run_stubtest.py b/scripts/run_stubtest.py index 35cbbef08124e..dedcdb5532593 100644 --- a/scripts/run_stubtest.py +++ b/scripts/run_stubtest.py @@ -47,8 +47,6 @@ # stubtest might be too sensitive "pandas._libs.lib.NoDefault", "pandas._libs.lib._NoDefault.no_default", - # stubtest/Cython is not recognizing the default value for the dtype parameter - "pandas._libs.lib.map_infer_mask", # internal type alias (should probably be private) "pandas._libs.lib.ndarray_obj_2d", # runtime argument "owner" has a default 
value but stub argument does not diff --git a/setup.py b/setup.py index 1ea7a502505b5..663bbd3952eab 100755 --- a/setup.py +++ b/setup.py @@ -37,7 +37,7 @@ def is_platform_mac(): # note: sync with pyproject.toml, environment.yml and asv.conf.json -min_cython_ver = "3.0.0" +min_cython_ver = "0.29.33" try: from Cython import (