From 9db444fcdcee9e20b5f942758db18172ebf97d03 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 21 Apr 2021 19:21:13 +1000 Subject: [PATCH 01/10] Basic type promotion and computation for CSC/ CSR broadcasting * Still has some failing tests * No benchmarks * No tests for unary ops * No support for functions with arity > 2 (what even are these functions) --- sparse/_compressed/elemwise.py | 117 +++++++++++++++++++++++++++++++++ sparse/_umath.py | 88 +++++++++++++++++++++---- sparse/tests/test_elemwise.py | 68 ++++++++++++++++++- 3 files changed, 259 insertions(+), 14 deletions(-) create mode 100644 sparse/_compressed/elemwise.py diff --git a/sparse/_compressed/elemwise.py b/sparse/_compressed/elemwise.py new file mode 100644 index 00000000..73db619d --- /dev/null +++ b/sparse/_compressed/elemwise.py @@ -0,0 +1,117 @@ +from functools import lru_cache +from typing import Callable + +import numpy as np +import scipy.sparse +from numba import njit + + +def op_unary(func, a): + res = a.copy() + res.data = func(a.data) + return res + + +@lru_cache(maxsize=None) +def _numba_d(func): + return njit(lambda *x: func(*x)) + + +def binary_op(func, a, b): + func = _numba_d(func) + return op_union_indices(func, a, b) + + +def op_union_indices( + op: Callable, a: scipy.sparse.csr_matrix, b: scipy.sparse.csr_matrix, *, default_value=0 +): + assert a.shape == b.shape + + if type(a) != type(b): + b = type(a)(b) + # a.sort_indices() + # b.sort_indices() + + # TODO: numpy is weird with bools here + out_dtype = np.array(op(a.data[0], b.data[0])).dtype + default_value = out_dtype.type(default_value) + return type(a)( + op_union_indices_csr_csr( + op, + a.indptr, + a.indices, + a.data, + b.indptr, + b.indices, + b.data, + out_dtype=out_dtype, + default_value=default_value, + ), + a.shape, + ) + + +@njit +def op_union_indices_csr_csr( + op: Callable, + a_indptr: np.ndarray, + a_indices: np.ndarray, + a_data: np.ndarray, + b_indptr: np.ndarray, + b_indices: np.ndarray, + b_data: np.ndarray, + out_dtype, + default_value, +): + out_indptr = np.zeros_like(a_indptr) + out_indices = np.zeros(len(a_indices) + len(b_indices), dtype=a_indices.dtype) + out_data = np.zeros(len(out_indices), dtype=out_dtype) + + out_idx = 0 + + for i in range(len(a_indptr) - 1): + + a_idx = a_indptr[i] + a_end = a_indptr[i + 1] + b_idx = b_indptr[i] + b_end = b_indptr[i + 1] + + while (a_idx < a_end) and (b_idx < b_end): + a_j = a_indices[a_idx] + b_j = b_indices[b_idx] + if a_j < b_j: + out_indices[out_idx] = a_j + out_data[out_idx] = op(a_data[a_idx], default_value) + a_idx += 1 + elif b_j < a_j: + out_indices[out_idx] = b_j + out_data[out_idx] = op(default_value, b_data[b_idx]) + b_idx += 1 + else: + out_indices[out_idx] = a_j + out_data[out_idx] = op(a_data[a_idx], b_data[b_idx]) + a_idx += 1 + b_idx += 1 + out_idx += 1 + + # Catch up the other set + while a_idx < a_end: + a_j = a_indices[a_idx] + out_indices[out_idx] = a_j + out_data[out_idx] = op(a_data[a_idx], default_value) + a_idx += 1 + out_idx += 1 + + while b_idx < b_end: + b_j = b_indices[b_idx] + out_indices[out_idx] = b_j + out_data[out_idx] = op(default_value, b_data[b_idx]) + b_idx += 1 + out_idx += 1 + + out_indptr[i + 1] = out_idx + + out_indices = out_indices[: out_idx] + out_data = out_data[: out_idx] + + return out_data, out_indices, out_indptr \ No newline at end of file diff --git a/sparse/_umath.py b/sparse/_umath.py index 1666c618..4ba1d1ec 100644 --- a/sparse/_umath.py +++ b/sparse/_umath.py @@ -405,6 +405,45 @@ def broadcast_to(x, shape): ) +# TODO: Figure out the right way to type this +# TODO: Figure out how to do 1d COO + CSR or CSC +def _resolve_result_type(args: "list[ArrayLike]") -> "Type": + from ._compressed import GCXS, CSR, CSC + from ._coo import COO + from ._dok import DOK + from ._compressed.compressed import _Compressed2d + + if all(isinstance(arg, DOK) for arg in args): + out_type = DOK + elif all(isinstance(arg, CSR) for arg in args): + out_type = CSR + elif all(isinstance(arg, CSC) for arg in args): + out_type = CSC + elif all(isinstance(arg, _Compressed2d) for arg in args): + out_type = CSR + elif all(isinstance(arg, GCXS) for arg in args): + out_type = GCXS + else: + out_type = COO + return out_type + + +def _from_scipy_sparse(a): + from ._compressed import CSR, CSC + from ._coo import COO + from ._dok import DOK + + assert isinstance(a, scipy.sparse.spmatrix) + if isinstance(a, scipy.sparse.csr_matrix): + return CSR(a) + elif isinstance(a, scipy.sparse.csc_matrix): + return CSC(a) + elif isinstance(a, scipy.sparse.dok_matrix): + return DOK(a) + else: + return COO(a) + + class _Elemwise: def __init__(self, func, *args, **kwargs): """ @@ -421,24 +460,20 @@ def __init__(self, func, *args, **kwargs): """ from ._coo import COO from ._sparse_array import SparseArray - from ._compressed import GCXS + from ._compressed import GCXS, CSR, CSC + from ._compressed.compressed import _Compressed2d from ._dok import DOK processed_args = [] - out_type = GCXS - - sparse_args = [arg for arg in args if isinstance(arg, SparseArray)] - - if all(isinstance(arg, DOK) for arg in sparse_args): - out_type = DOK - elif all(isinstance(arg, GCXS) for arg in sparse_args): - out_type = GCXS - else: - out_type = COO + # Should this happen before dispatch? + # Hmm, this may need major major changes. + # Case to consider: CSR or CSC + 1d COO for arg in args: if isinstance(arg, scipy.sparse.spmatrix): - processed_args.append(COO.from_scipy_sparse(arg)) + arg = _from_scipy_sparse(arg) + if isinstance(arg, _Compressed2d): + processed_args.append(arg) elif isscalar(arg) or isinstance(arg, np.ndarray): # Faster and more reliable to pass ()-shaped ndarrays as scalars. processed_args.append(np.asarray(arg)) @@ -450,7 +485,7 @@ def __init__(self, func, *args, **kwargs): else: processed_args.append(arg) - self.out_type = out_type + self.out_type = _resolve_result_type(processed_args) self.args = tuple(processed_args) self.func = func self.dtype = kwargs.pop("dtype", None) @@ -463,6 +498,7 @@ def __init__(self, func, *args, **kwargs): def get_result(self): from ._coo import COO + from ._compressed.compressed import _Compressed2d if self.args is None: return NotImplemented @@ -471,6 +507,9 @@ def get_result(self): args = [a.todense() if isinstance(a, COO) else a for a in self.args] return self.func(*args, **self.kwargs) + if issubclass(self.out_type, _Compressed2d): + return self._get_result_compressed_2d() + if any(s == 0 for s in self.shape): data = np.empty((0,), dtype=self.fill_value.dtype) coords = np.empty((0, len(self.shape)), dtype=np.intp) @@ -517,6 +556,29 @@ def get_result(self): fill_value=self.fill_value, ).asformat(self.out_type) + def _get_result_compressed_2d(self): + from ._compressed import elemwise as elemwise2d + from ._compressed.compressed import _Compressed2d + if len(self.args) == 1: + result = elemwise2d.op_unary(self.func, self.args[0]) + + processed_args = [] + for arg in self.args: + if isinstance(arg, self.out_type): + processed_args.append(arg) + elif isinstance(arg, _Compressed2d): + processed_args.append(self.out_type(arg)) + elif isinstance(arg, np.ndarray): + processed_args.append(np.broadcast_to(arg, self.shape)) + else: + raise NotImplementedError() + + if len(processed_args) == 2: + result = elemwise2d.binary_op(self.func, *processed_args) + + return result + + def _get_fill_value(self): """ A function that finds and returns the fill-value. diff --git a/sparse/tests/test_elemwise.py b/sparse/tests/test_elemwise.py index 4f5022ed..7fafc14b 100644 --- a/sparse/tests/test_elemwise.py +++ b/sparse/tests/test_elemwise.py @@ -3,7 +3,7 @@ import pytest import operator from sparse import COO, DOK -from sparse._compressed import GCXS +from sparse._compressed import GCXS, CSR, CSC from sparse._utils import assert_eq, random_value_array @@ -481,6 +481,72 @@ def test_leftside_elemwise_scalar(func, scalar, convert_to_np_number): assert_eq(fs, func(y, x)) +@pytest.mark.parametrize("func", [np.add, np.subtract, np.multiply]) +@pytest.mark.parametrize("type_a,type_b", [(CSR, CSR), (CSR, CSC), (CSR, np.asarray)]) +def test_2d_binary_op(func, type_a, type_b): + # TODO: implement COO.asformat(CSR) + from sparse import SparseArray + + # Doing the ugly conditional since this errors from explicit conversion to dense + # There's gotta be a better way to do this + if type_a != np.asarray: + a = type_a(sparse.random((20, 10), density=0.5)) + else: + a = sparse.random((20, 10), density=0.5).todense() + if type_b != np.asarray: + b = type_b(sparse.random((20, 10), density=0.5)) + else: + b = sparse.random((20, 10), density=0.5).todense() + + ref_a = a.todense() if isinstance(a, SparseArray) else a + ref_b = b.todense() if isinstance(b, SparseArray) else b + + expected = func(ref_a, ref_b) + result = func(a, b) + + assert_eq(expected, result) + + +from itertools import product + + +@pytest.mark.parametrize("func", [np.add, np.subtract, np.multiply]) +@pytest.mark.parametrize( + "a,b", + # TODO: would be nice if the tests would take names from these parameters + list( + product( + [ + # pytest.param(CSR(sparse.random((20, 10), density=0.5)), id="CSR"), + # pytest.param(CSC(sparse.random((20, 10), density=0.5)), id="CSC"), + # pytest.param(sparse.random((20, 10), density=0.5).todense(), id="COO"), + # pytest.param(sparse.random((20, 10), density=0.5, format=COO), id="dense-2d"), + # pytest.param(np.random.rand(20), id="dense-row"), + # pytest.param(np.random.rand(1, 10), id="dense-col"), + CSR(sparse.random((20, 10), density=0.5)), + CSC(sparse.random((20, 10), density=0.5)), + sparse.random((20, 10), density=0.5).todense(), + sparse.random((20, 10), density=0.5, format=COO), + np.random.rand(10), + np.random.rand(20, 1), + ], + repeat=2, + ) + ), +) +def test_2d_binary_op(func, a, b): + # TODO: implement COO.asformat(CSR) + from sparse import SparseArray + + ref_a = a.todense() if isinstance(a, SparseArray) else a + ref_b = b.todense() if isinstance(b, SparseArray) else b + + expected = func(ref_a, ref_b) + result = func(a, b) + + assert_eq(expected, result) + + @pytest.mark.parametrize( "func, scalar", [ From e312914f38691029691fb3ff848a717e9a5429b2 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 21 Apr 2021 21:30:14 +1000 Subject: [PATCH 02/10] Fixup test --- sparse/_compressed/elemwise.py | 7 ++++++- sparse/_umath.py | 3 ++- sparse/tests/test_elemwise.py | 15 ++++++++++++++- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/sparse/_compressed/elemwise.py b/sparse/_compressed/elemwise.py index 73db619d..b9ad48a0 100644 --- a/sparse/_compressed/elemwise.py +++ b/sparse/_compressed/elemwise.py @@ -5,6 +5,8 @@ import scipy.sparse from numba import njit +from .compressed import _Compressed2d + def op_unary(func, a): res = a.copy() @@ -19,7 +21,10 @@ def _numba_d(func): def binary_op(func, a, b): func = _numba_d(func) - return op_union_indices(func, a, b) + if isinstance(a, _Compressed2d) and isinstance(b, _Compressed2d): + return op_union_indices(func, a, b) + else: + raise NotImplementedError() def op_union_indices( diff --git a/sparse/_umath.py b/sparse/_umath.py index 4ba1d1ec..baeb5b44 100644 --- a/sparse/_umath.py +++ b/sparse/_umath.py @@ -589,9 +589,10 @@ def _get_fill_value(self): If the fill-value is inconsistent. """ from ._coo import COO + from ._sparse_array import SparseArray zero_args = tuple( - arg.fill_value[...] if isinstance(arg, COO) else arg for arg in self.args + arg.fill_value[...] if isinstance(arg, SparseArray) else arg for arg in self.args ) # Some elemwise functions require a dtype argument, some abhorr it. diff --git a/sparse/tests/test_elemwise.py b/sparse/tests/test_elemwise.py index 7fafc14b..5a33fd16 100644 --- a/sparse/tests/test_elemwise.py +++ b/sparse/tests/test_elemwise.py @@ -5,6 +5,7 @@ from sparse import COO, DOK from sparse._compressed import GCXS, CSR, CSC from sparse._utils import assert_eq, random_value_array +from sparse import SparseArray @pytest.mark.parametrize( @@ -508,7 +509,19 @@ def test_2d_binary_op(func, type_a, type_b): from itertools import product +from functools import singledispatch +@singledispatch +def asdense(x): + raise NotImplementedError() + +@asdense.register(SparseArray) +def _(x): + return x.todense() + +@asdense.register(np.ndarray) +def _(x): + return x @pytest.mark.parametrize("func", [np.add, np.subtract, np.multiply]) @pytest.mark.parametrize( @@ -544,7 +557,7 @@ def test_2d_binary_op(func, a, b): expected = func(ref_a, ref_b) result = func(a, b) - assert_eq(expected, result) + assert_eq(expected, asdense(result)) @pytest.mark.parametrize( From 91671c98cf52c127687499d37cf1047bbb1110c1 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 21 Apr 2021 21:38:46 +1000 Subject: [PATCH 03/10] Minor cleanup (might have broke something) --- sparse/_umath.py | 6 +++--- sparse/tests/test_elemwise.py | 27 +-------------------------- 2 files changed, 4 insertions(+), 29 deletions(-) diff --git a/sparse/_umath.py b/sparse/_umath.py index baeb5b44..fa17c69b 100644 --- a/sparse/_umath.py +++ b/sparse/_umath.py @@ -498,13 +498,14 @@ def __init__(self, func, *args, **kwargs): def get_result(self): from ._coo import COO + from ._sparse_array import SparseArray from ._compressed.compressed import _Compressed2d if self.args is None: return NotImplemented if self._dense_result: - args = [a.todense() if isinstance(a, COO) else a for a in self.args] + args = [a.todense() if isinstance(a, SparseArray) else a for a in self.args] return self.func(*args, **self.kwargs) if issubclass(self.out_type, _Compressed2d): @@ -588,7 +589,6 @@ def _get_fill_value(self): ValueError If the fill-value is inconsistent. """ - from ._coo import COO from ._sparse_array import SparseArray zero_args = tuple( @@ -609,7 +609,7 @@ def _get_fill_value(self): fill_value = fill_value_array[(0,) * fill_value_array.ndim] except IndexError: zero_args = tuple( - arg.fill_value if isinstance(arg, COO) else _zero_of_dtype(arg.dtype) + arg.fill_value if isinstance(arg, SparseArray) else _zero_of_dtype(arg.dtype) for arg in self.args ) fill_value = self.func(*zero_args, **self.kwargs)[()] diff --git a/sparse/tests/test_elemwise.py b/sparse/tests/test_elemwise.py index 5a33fd16..bb169568 100644 --- a/sparse/tests/test_elemwise.py +++ b/sparse/tests/test_elemwise.py @@ -482,32 +482,6 @@ def test_leftside_elemwise_scalar(func, scalar, convert_to_np_number): assert_eq(fs, func(y, x)) -@pytest.mark.parametrize("func", [np.add, np.subtract, np.multiply]) -@pytest.mark.parametrize("type_a,type_b", [(CSR, CSR), (CSR, CSC), (CSR, np.asarray)]) -def test_2d_binary_op(func, type_a, type_b): - # TODO: implement COO.asformat(CSR) - from sparse import SparseArray - - # Doing the ugly conditional since this errors from explicit conversion to dense - # There's gotta be a better way to do this - if type_a != np.asarray: - a = type_a(sparse.random((20, 10), density=0.5)) - else: - a = sparse.random((20, 10), density=0.5).todense() - if type_b != np.asarray: - b = type_b(sparse.random((20, 10), density=0.5)) - else: - b = sparse.random((20, 10), density=0.5).todense() - - ref_a = a.todense() if isinstance(a, SparseArray) else a - ref_b = b.todense() if isinstance(b, SparseArray) else b - - expected = func(ref_a, ref_b) - result = func(a, b) - - assert_eq(expected, result) - - from itertools import product from functools import singledispatch @@ -523,6 +497,7 @@ def _(x): def _(x): return x +# TODO: Add test for result types @pytest.mark.parametrize("func", [np.add, np.subtract, np.multiply]) @pytest.mark.parametrize( "a,b", From 7f2d2ed6738c6ffaf1b5fdf20185eeca701966f2 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 22 Apr 2021 15:55:31 +1000 Subject: [PATCH 04/10] temp print statement --- sparse/_umath.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sparse/_umath.py b/sparse/_umath.py index fa17c69b..b7a91214 100644 --- a/sparse/_umath.py +++ b/sparse/_umath.py @@ -411,8 +411,12 @@ def _resolve_result_type(args: "list[ArrayLike]") -> "Type": from ._compressed import GCXS, CSR, CSC from ._coo import COO from ._dok import DOK + from ._sparse_array import SparseArray from ._compressed.compressed import _Compressed2d + args = [arg for arg in args if isinstance(arg, SparseArray)] + print([type(arg) for arg in args]) + if all(isinstance(arg, DOK) for arg in args): out_type = DOK elif all(isinstance(arg, CSR) for arg in args): From 5ad790ec6a8f223aa65428aef80b7079bc7391df Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 22 Apr 2021 16:09:43 +1000 Subject: [PATCH 05/10] Fix return type determination --- sparse/_umath.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sparse/_umath.py b/sparse/_umath.py index b7a91214..eebfadaa 100644 --- a/sparse/_umath.py +++ b/sparse/_umath.py @@ -415,7 +415,6 @@ def _resolve_result_type(args: "list[ArrayLike]") -> "Type": from ._compressed.compressed import _Compressed2d args = [arg for arg in args if isinstance(arg, SparseArray)] - print([type(arg) for arg in args]) if all(isinstance(arg, DOK) for arg in args): out_type = DOK @@ -468,14 +467,15 @@ def __init__(self, func, *args, **kwargs): from ._compressed.compressed import _Compressed2d from ._dok import DOK + args = [arg if not isinstance(arg, scipy.sparse.spmatrix) else _from_scipy_sparse(arg) for arg in args] + processed_args = [] + self.out_type = _resolve_result_type(args) # Should this happen before dispatch? # Hmm, this may need major major changes. # Case to consider: CSR or CSC + 1d COO for arg in args: - if isinstance(arg, scipy.sparse.spmatrix): - arg = _from_scipy_sparse(arg) if isinstance(arg, _Compressed2d): processed_args.append(arg) elif isscalar(arg) or isinstance(arg, np.ndarray): @@ -489,7 +489,6 @@ def __init__(self, func, *args, **kwargs): else: processed_args.append(arg) - self.out_type = _resolve_result_type(processed_args) self.args = tuple(processed_args) self.func = func self.dtype = kwargs.pop("dtype", None) From 34ce2997afb14183f5048686c5385c4133445ae5 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 22 Apr 2021 19:08:53 +1000 Subject: [PATCH 06/10] Fix fill_value argument handling for CSC and CSR, handle scipy -> sparse dok conversion --- sparse/_compressed/compressed.py | 20 ++++++++++++++------ sparse/_umath.py | 18 +++++++++++++----- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/sparse/_compressed/compressed.py b/sparse/_compressed/compressed.py index 311b2c6f..33ab2252 100644 --- a/sparse/_compressed/compressed.py +++ b/sparse/_compressed/compressed.py @@ -16,6 +16,7 @@ can_store, check_zero_fill_value, check_compressed_axes, + _zero_of_dtype, equivalent, ) from .._coo.core import COO @@ -144,7 +145,7 @@ def __init__( shape=None, compressed_axes=None, prune=False, - fill_value=0, + fill_value=None, idx_dtype=None, ): @@ -171,6 +172,10 @@ def __init__( arg.fill_value, ) + self.data, self.indices, self.indptr = arg + + if fill_value is None: + fill_value = _zero_of_dtype(self.data.dtype) if shape is None: raise ValueError("missing `shape` argument") @@ -179,8 +184,6 @@ def __init__( if len(shape) == 1: compressed_axes = None - self.data, self.indices, self.indptr = arg - if self.data.ndim != 1: raise ValueError("data must be a scalar or 1-dimensional.") @@ -813,7 +816,12 @@ def _prune(self): class _Compressed2d(GCXS): def __init__( - self, arg, shape=None, compressed_axes=None, prune=False, fill_value=0 + self, + arg, + shape=None, + compressed_axes=None, + prune=False, + fill_value=None, ): if not hasattr(arg, "shape") and shape is None: raise ValueError("missing `shape` argument") @@ -856,7 +864,7 @@ class CSR(_Compressed2d): Sparse supports 2-D CSR. """ - def __init__(self, arg, shape=None, prune=False, fill_value=0): + def __init__(self, arg, shape=None, prune=False, fill_value=None): super().__init__(arg, shape=shape, compressed_axes=(0,), fill_value=fill_value) @classmethod @@ -881,7 +889,7 @@ class CSC(_Compressed2d): Sparse supports 2-D CSC. """ - def __init__(self, arg, shape=None, prune=False, fill_value=0): + def __init__(self, arg, shape=None, prune=False, fill_value=None): super().__init__(arg, shape=shape, compressed_axes=(1,), fill_value=fill_value) @classmethod diff --git a/sparse/_umath.py b/sparse/_umath.py index eebfadaa..e4457686 100644 --- a/sparse/_umath.py +++ b/sparse/_umath.py @@ -442,7 +442,7 @@ def _from_scipy_sparse(a): elif isinstance(a, scipy.sparse.csc_matrix): return CSC(a) elif isinstance(a, scipy.sparse.dok_matrix): - return DOK(a) + return DOK(a.shape, data=dict(a)) else: return COO(a) @@ -467,7 +467,12 @@ def __init__(self, func, *args, **kwargs): from ._compressed.compressed import _Compressed2d from ._dok import DOK - args = [arg if not isinstance(arg, scipy.sparse.spmatrix) else _from_scipy_sparse(arg) for arg in args] + args = [ + arg + if not isinstance(arg, scipy.sparse.spmatrix) + else _from_scipy_sparse(arg) + for arg in args + ] processed_args = [] @@ -563,6 +568,7 @@ def get_result(self): def _get_result_compressed_2d(self): from ._compressed import elemwise as elemwise2d from ._compressed.compressed import _Compressed2d + if len(self.args) == 1: result = elemwise2d.op_unary(self.func, self.args[0]) @@ -582,7 +588,6 @@ def _get_result_compressed_2d(self): return result - def _get_fill_value(self): """ A function that finds and returns the fill-value. @@ -595,7 +600,8 @@ def _get_fill_value(self): from ._sparse_array import SparseArray zero_args = tuple( - arg.fill_value[...] if isinstance(arg, SparseArray) else arg for arg in self.args + arg.fill_value[...] if isinstance(arg, SparseArray) else arg + for arg in self.args ) # Some elemwise functions require a dtype argument, some abhorr it. @@ -612,7 +618,9 @@ def _get_fill_value(self): fill_value = fill_value_array[(0,) * fill_value_array.ndim] except IndexError: zero_args = tuple( - arg.fill_value if isinstance(arg, SparseArray) else _zero_of_dtype(arg.dtype) + arg.fill_value + if isinstance(arg, SparseArray) + else _zero_of_dtype(arg.dtype) for arg in self.args ) fill_value = self.func(*zero_args, **self.kwargs)[()] From ccffedcb0556596e09297aab7b0991cb9447bb29 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 22 Apr 2021 19:15:55 +1000 Subject: [PATCH 07/10] Unbreak elemwise ops with CSR/CSC and COO --- sparse/_umath.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparse/_umath.py b/sparse/_umath.py index e4457686..2e5c9540 100644 --- a/sparse/_umath.py +++ b/sparse/_umath.py @@ -481,7 +481,7 @@ def __init__(self, func, *args, **kwargs): # Hmm, this may need major major changes. # Case to consider: CSR or CSC + 1d COO for arg in args: - if isinstance(arg, _Compressed2d): + if self.out_type != COO and isinstance(arg, _Compressed2d): processed_args.append(arg) elif isscalar(arg) or isinstance(arg, np.ndarray): # Faster and more reliable to pass ()-shaped ndarrays as scalars. From 30004b94b20096e92ba08258058024591e355cb8 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 22 Apr 2021 19:41:39 +1000 Subject: [PATCH 08/10] Skip some tests with densification --- sparse/tests/test_elemwise.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sparse/tests/test_elemwise.py b/sparse/tests/test_elemwise.py index bb169568..659feb71 100644 --- a/sparse/tests/test_elemwise.py +++ b/sparse/tests/test_elemwise.py @@ -524,8 +524,15 @@ def _(x): ) def test_2d_binary_op(func, a, b): # TODO: implement COO.asformat(CSR) + def _is_ndarray_1d(x): + return isinstance(x, np.ndarray) and sum(s != 1 for s in x.shape) <= 1 + from sparse import SparseArray + if func in [np.add, np.subtract] and (_is_ndarray_1d(a) or _is_ndarray_1d(b)): + # https://github.com/pydata/sparse/issues/460 + pytest.skip() + ref_a = a.todense() if isinstance(a, SparseArray) else a ref_b = b.todense() if isinstance(b, SparseArray) else b From c9510af87a4467eed3005d6fee257808b1cd0674 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Sat, 24 Apr 2021 17:42:29 +1000 Subject: [PATCH 09/10] Resize array --- sparse/_compressed/elemwise.py | 74 ++++++++++++++++++++++------------ 1 file changed, 48 insertions(+), 26 deletions(-) diff --git a/sparse/_compressed/elemwise.py b/sparse/_compressed/elemwise.py index b9ad48a0..21347b36 100644 --- a/sparse/_compressed/elemwise.py +++ b/sparse/_compressed/elemwise.py @@ -40,8 +40,11 @@ def op_union_indices( # TODO: numpy is weird with bools here out_dtype = np.array(op(a.data[0], b.data[0])).dtype default_value = out_dtype.type(default_value) - return type(a)( - op_union_indices_csr_csr( + out_indptr = np.zeros_like(a.indptr) + out_indices = np.zeros(len(a.indices) + len(b.indices), dtype=np.promote_types(a.indices.dtype, b.indices.dtype)) + out_data = np.zeros(len(out_indices), dtype=out_dtype) + + nnz = op_union_indices_csr_csr( op, a.indptr, a.indices, @@ -49,11 +52,15 @@ def op_union_indices( b.indptr, b.indices, b.data, + out_indptr, + out_indices, + out_data, out_dtype=out_dtype, default_value=default_value, - ), - a.shape, - ) + ) + out_data.resize(nnz) + out_indices.resize(nnz) + return type(a)((out_data, out_indices, out_indptr), shape=a.shape) @njit @@ -65,12 +72,15 @@ def op_union_indices_csr_csr( b_indptr: np.ndarray, b_indices: np.ndarray, b_data: np.ndarray, + out_indptr: np.ndarray, + out_indices: np.ndarray, + out_data: np.ndarray, out_dtype, default_value, ): - out_indptr = np.zeros_like(a_indptr) - out_indices = np.zeros(len(a_indices) + len(b_indices), dtype=a_indices.dtype) - out_data = np.zeros(len(out_indices), dtype=out_dtype) + # out_indptr = np.zeros_like(a_indptr) + # out_indices = np.zeros(len(a_indices) + len(b_indices), dtype=a_indices.dtype) + # out_data = np.zeros(len(out_indices), dtype=out_dtype) out_idx = 0 @@ -85,38 +95,50 @@ def op_union_indices_csr_csr( a_j = a_indices[a_idx] b_j = b_indices[b_idx] if a_j < b_j: - out_indices[out_idx] = a_j - out_data[out_idx] = op(a_data[a_idx], default_value) + val = op(a_data[a_idx], default_value) + if val != default_value: + out_indices[out_idx] = a_j + out_data[out_idx] = val + out_idx += 1 a_idx += 1 elif b_j < a_j: - out_indices[out_idx] = b_j - out_data[out_idx] = op(default_value, b_data[b_idx]) + val = op(default_value, b_data[b_idx]) + if val != default_value: + out_indices[out_idx] = b_j + out_data[out_idx] = val + out_idx += 1 b_idx += 1 else: - out_indices[out_idx] = a_j - out_data[out_idx] = op(a_data[a_idx], b_data[b_idx]) + val = op(a_data[a_idx], b_data[b_idx]) + if val != default_value: + out_indices[out_idx] = a_j + out_data[out_idx] = val + out_idx += 1 a_idx += 1 b_idx += 1 - out_idx += 1 # Catch up the other set while a_idx < a_end: - a_j = a_indices[a_idx] - out_indices[out_idx] = a_j - out_data[out_idx] = op(a_data[a_idx], default_value) + val = op(a_data[a_idx], default_value) + if val != default_value: + out_indices[out_idx] = a_indices[a_idx] + out_data[out_idx] = val + out_idx += 1 a_idx += 1 - out_idx += 1 while b_idx < b_end: - b_j = b_indices[b_idx] - out_indices[out_idx] = b_j - out_data[out_idx] = op(default_value, b_data[b_idx]) + val = op(default_value, b_data[b_idx]) + if val != default_value: + out_indices[out_idx] = b_indices[b_idx] + out_data[out_idx] = val + out_idx += 1 b_idx += 1 - out_idx += 1 out_indptr[i + 1] = out_idx - out_indices = out_indices[: out_idx] - out_data = out_data[: out_idx] + # This may need to change to be "resize" to allow memory reallocation + # resize is currently not implemented in numba + # out_indices = out_indices[: out_idx] + # out_data = out_data[: out_idx] - return out_data, out_indices, out_indptr \ No newline at end of file + return out_idx \ No newline at end of file From c96cc1af788469ad87733097b36b7d866f493507 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Sat, 24 Apr 2021 19:37:31 +1000 Subject: [PATCH 10/10] Use pruning strategy from scipy instead of resizing (it seems a bit faster) --- sparse/_compressed/elemwise.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/sparse/_compressed/elemwise.py b/sparse/_compressed/elemwise.py index 21347b36..75c81afa 100644 --- a/sparse/_compressed/elemwise.py +++ b/sparse/_compressed/elemwise.py @@ -26,6 +26,17 @@ def binary_op(func, a, b): else: raise NotImplementedError() +# From scipy._util +def _prune_array(array): + """Return an array equivalent to the input array. If the input + array is a view of a much larger array, copy its contents to a + newly allocated array. Otherwise, return the input unchanged. + """ + if array.base is not None and array.size < array.base.size // 2: + return array.copy() + return array + + def op_union_indices( op: Callable, a: scipy.sparse.csr_matrix, b: scipy.sparse.csr_matrix, *, default_value=0 @@ -58,8 +69,8 @@ def op_union_indices( out_dtype=out_dtype, default_value=default_value, ) - out_data.resize(nnz) - out_indices.resize(nnz) + out_data = _prune_array(out_data[:nnz]) + out_indices = _prune_array(out_indices[:nnz]) return type(a)((out_data, out_indices, out_indptr), shape=a.shape) @@ -138,7 +149,7 @@ def op_union_indices_csr_csr( # This may need to change to be "resize" to allow memory reallocation # resize is currently not implemented in numba - # out_indices = out_indices[: out_idx] - # out_data = out_data[: out_idx] + out_indices = out_indices[: out_idx] + out_data = out_data[: out_idx] return out_idx \ No newline at end of file