From 3821ed1a03256df2002f6d11e94c96fe8a59cca9 Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Wed, 2 Oct 2024 11:43:27 +0200 Subject: [PATCH 1/3] fix: Bitwise aggregations should not ignore nulls --- crates/polars-arrow/src/array/mod.rs | 5 + crates/polars-compute/src/bitwise/mod.rs | 115 ++++++------------ .../src/chunked_array/ops/bitwise_reduce.rs | 54 +++----- crates/polars-python/Cargo.toml | 2 + .../polars-python/src/series/aggregation.rs | 33 +++++ py-polars/polars/series/series.py | 3 + .../tests/unit/operations/test_bitwise.py | 10 +- 7 files changed, 103 insertions(+), 119 deletions(-) diff --git a/crates/polars-arrow/src/array/mod.rs b/crates/polars-arrow/src/array/mod.rs index 1004e36f2514..08702e8021d3 100644 --- a/crates/polars-arrow/src/array/mod.rs +++ b/crates/polars-arrow/src/array/mod.rs @@ -98,6 +98,11 @@ pub trait Array: Send + Sync + dyn_clone::DynClone + 'static { .unwrap_or(0) } + #[inline] + fn has_nulls(&self) -> bool { + self.null_count() > 0 + } + /// Returns whether slot `i` is null. /// # Panic /// Panics iff `i >= self.len()`. diff --git a/crates/polars-compute/src/bitwise/mod.rs b/crates/polars-compute/src/bitwise/mod.rs index 578acb91b1ac..15f5a1212319 100644 --- a/crates/polars-compute/src/bitwise/mod.rs +++ b/crates/polars-compute/src/bitwise/mod.rs @@ -1,9 +1,8 @@ use std::convert::identity; -use arrow::array::{BooleanArray, PrimitiveArray}; +use arrow::array::{Array, BooleanArray, PrimitiveArray}; use arrow::datatypes::ArrowDataType; use arrow::legacy::utils::CustomIterTools; -use bytemuck::Zeroable; pub trait BitwiseKernel { type Scalar; @@ -36,8 +35,7 @@ macro_rules! impl_bitwise_kernel { fn count_ones(&self) -> PrimitiveArray { PrimitiveArray::new( ArrowDataType::UInt32, - self.values() - .iter() + self.values_iter() .map(|&v| $to_bits(v).count_ones()) .collect_trusted::>() .into(), @@ -49,9 +47,7 @@ macro_rules! impl_bitwise_kernel { fn count_zeros(&self) -> PrimitiveArray { PrimitiveArray::new( ArrowDataType::UInt32, - self - .values() - .iter() + self.values_iter() .map(|&v| $to_bits(v).count_zeros()) .collect_trusted::>() .into(), @@ -63,8 +59,7 @@ macro_rules! impl_bitwise_kernel { fn leading_ones(&self) -> PrimitiveArray { PrimitiveArray::new( ArrowDataType::UInt32, - self.values() - .iter() + self.values_iter() .map(|&v| $to_bits(v).leading_ones()) .collect_trusted::>() .into(), @@ -76,8 +71,7 @@ macro_rules! impl_bitwise_kernel { fn leading_zeros(&self) -> PrimitiveArray { PrimitiveArray::new( ArrowDataType::UInt32, - self.values() - .iter() + self.values_iter() .map(|&v| $to_bits(v).leading_zeros()) .collect_trusted::>() .into(), @@ -89,8 +83,7 @@ macro_rules! impl_bitwise_kernel { fn trailing_ones(&self) -> PrimitiveArray { PrimitiveArray::new( ArrowDataType::UInt32, - self.values() - .iter() + self.values_iter() .map(|&v| $to_bits(v).trailing_ones()) .collect_trusted::>() .into(), @@ -112,47 +105,29 @@ macro_rules! impl_bitwise_kernel { #[inline(never)] fn reduce_and(&self) -> Option { - if self.validity().map_or(false, |v| v.unset_bits() > 0) { - return None; + if !self.has_nulls() { + self.values_iter().copied().map($to_bits).reduce(|a, b| a & b).map($from_bits) + } else { + self.non_null_values_iter().map($to_bits).reduce(|a, b| a & b).map($from_bits) } - - let values = self.values(); - - if values.is_empty() { - return None; - } - - Some($from_bits(values.iter().fold(!$to_bits(<$T>::zeroed()), |a, &b| a & $to_bits(b)))) } #[inline(never)] fn reduce_or(&self) -> Option { - if self.validity().map_or(false, |v| v.unset_bits() > 0) { - return None; - } - - let values = self.values(); - - if values.is_empty() { - return None; + if !self.has_nulls() { + self.values_iter().copied().map($to_bits).reduce(|a, b| a | b).map($from_bits) + } else { + self.non_null_values_iter().map($to_bits).reduce(|a, b| a | b).map($from_bits) } - - Some($from_bits(values.iter().fold($to_bits(<$T>::zeroed()), |a, &b| a | $to_bits(b)))) } #[inline(never)] fn reduce_xor(&self) -> Option { - if self.validity().map_or(false, |v| v.unset_bits() > 0) { - return None; - } - - let values = self.values(); - - if values.is_empty() { - return None; + if !self.has_nulls() { + self.values_iter().copied().map($to_bits).reduce(|a, b| a ^ b).map($from_bits) + } else { + self.non_null_values_iter().map($to_bits).reduce(|a, b| a ^ b).map($from_bits) } - - Some($from_bits(values.iter().fold($to_bits(<$T>::zeroed()), |a, &b| a ^ $to_bits(b)))) } fn bit_and(lhs: Self::Scalar, rhs: Self::Scalar) -> Self::Scalar { @@ -189,8 +164,7 @@ impl BitwiseKernel for BooleanArray { fn count_ones(&self) -> PrimitiveArray { PrimitiveArray::new( ArrowDataType::UInt32, - self.values() - .iter() + self.values_iter() .map(u32::from) .collect_trusted::>() .into(), @@ -202,8 +176,7 @@ impl BitwiseKernel for BooleanArray { fn count_zeros(&self) -> PrimitiveArray { PrimitiveArray::new( ArrowDataType::UInt32, - self.values() - .iter() + self.values_iter() .map(|v| u32::from(!v)) .collect_trusted::>() .into(), @@ -232,45 +205,33 @@ impl BitwiseKernel for BooleanArray { } fn reduce_and(&self) -> Option { - if self.validity().map_or(false, |v| v.unset_bits() > 0) { - return None; + if self.len() == self.null_count() { + None + } else if !self.has_nulls() { + Some(self.values().unset_bits() == 0) + } else { + Some((self.values() & self.validity().unwrap()).unset_bits() == 0) } - - let values = self.values(); - - if values.is_empty() { - return None; - } - - Some(values.unset_bits() == 0) } fn reduce_or(&self) -> Option { - if self.validity().map_or(false, |v| v.unset_bits() > 0) { - return None; - } - - let values = self.values(); - - if values.is_empty() { - return None; + if self.len() == self.null_count() { + None + } else if !self.has_nulls() { + Some(self.values().set_bits() > 0) + } else { + Some((self.values() & self.validity().unwrap()).set_bits() > 0) } - - Some(values.set_bits() > 0) } fn reduce_xor(&self) -> Option { - if self.validity().map_or(false, |v| v.unset_bits() > 0) { - return None; - } - - let values = self.values(); - - if values.is_empty() { - return None; + if self.len() == self.null_count() { + None + } else if !self.has_nulls() { + Some(self.values().set_bits() % 2 == 1) + } else { + Some((self.values() & self.validity().unwrap()).set_bits() % 2 == 1) } - - Some(values.set_bits() % 2 == 1) } fn bit_and(lhs: Self::Scalar, rhs: Self::Scalar) -> Self::Scalar { diff --git a/crates/polars-core/src/chunked_array/ops/bitwise_reduce.rs b/crates/polars-core/src/chunked_array/ops/bitwise_reduce.rs index 5e033b53ab5d..867d04d878e6 100644 --- a/crates/polars-core/src/chunked_array/ops/bitwise_reduce.rs +++ b/crates/polars-core/src/chunked_array/ops/bitwise_reduce.rs @@ -1,4 +1,4 @@ -use arrow::array::{Array, PrimitiveArray}; +use arrow::array::PrimitiveArray; use arrow::types::NativeType; use polars_compute::bitwise::BitwiseKernel; @@ -13,33 +13,23 @@ where type Physical = T::Native; fn and_reduce(&self) -> Option { - if self.null_count() > 0 { - return None; - } - self.downcast_iter() - .filter(|arr| !arr.is_empty()) - .map(|arr| BitwiseKernel::reduce_and(arr).unwrap()) + .map(|arr| BitwiseKernel::reduce_and(arr)) + .flatten() .reduce( as BitwiseKernel>::bit_and) } - fn or_reduce(&self) -> Option { - if self.null_count() > 0 { - return None; - } + fn or_reduce(&self) -> Option { self.downcast_iter() - .filter(|arr| !arr.is_empty()) - .map(|arr| BitwiseKernel::reduce_or(arr).unwrap()) + .map(|arr| BitwiseKernel::reduce_or(arr)) + .flatten() .reduce( as BitwiseKernel>::bit_or) } - fn xor_reduce(&self) -> Option { - if self.null_count() > 0 { - return None; - } + fn xor_reduce(&self) -> Option { self.downcast_iter() - .filter(|arr| !arr.is_empty()) - .map(|arr| BitwiseKernel::reduce_xor(arr).unwrap()) + .map(|arr| BitwiseKernel::reduce_xor(arr)) + .flatten() .reduce( as BitwiseKernel>::bit_xor) } } @@ -48,33 +38,23 @@ impl ChunkBitwiseReduce for ChunkedArray { type Physical = bool; fn and_reduce(&self) -> Option { - if self.null_count() > 0 { - return None; - } - self.downcast_iter() - .filter(|arr| !arr.is_empty()) - .map(|arr| BitwiseKernel::reduce_and(arr).unwrap()) + .map(|arr| BitwiseKernel::reduce_and(arr)) + .flatten() .reduce(|a, b| a & b) } - fn or_reduce(&self) -> Option { - if self.null_count() > 0 { - return None; - } + fn or_reduce(&self) -> Option { self.downcast_iter() - .filter(|arr| !arr.is_empty()) - .map(|arr| BitwiseKernel::reduce_or(arr).unwrap()) + .map(|arr| BitwiseKernel::reduce_or(arr)) + .flatten() .reduce(|a, b| a | b) } - fn xor_reduce(&self) -> Option { - if self.null_count() > 0 { - return None; - } + fn xor_reduce(&self) -> Option { self.downcast_iter() - .filter(|arr| !arr.is_empty()) - .map(|arr| BitwiseKernel::reduce_xor(arr).unwrap()) + .map(|arr| BitwiseKernel::reduce_xor(arr)) + .flatten() .reduce(|a, b| a ^ b) } } diff --git a/crates/polars-python/Cargo.toml b/crates/polars-python/Cargo.toml index 1df741a4f51f..da4b992bfbec 100644 --- a/crates/polars-python/Cargo.toml +++ b/crates/polars-python/Cargo.toml @@ -162,6 +162,7 @@ peaks = ["polars/peaks"] hist = ["polars/hist"] find_many = ["polars/find_many"] new_streaming = ["polars-lazy/new_streaming"] +bitwise = ["polars/bitwise"] dtype-i8 = [] dtype-i16 = [] @@ -182,6 +183,7 @@ dtypes = [ operations = [ "array_any_all", "array_count", + "bitwise", "is_in", "repeat_by", "trigonometry", diff --git a/crates/polars-python/src/series/aggregation.rs b/crates/polars-python/src/series/aggregation.rs index ac324794564c..65fda49bb71e 100644 --- a/crates/polars-python/src/series/aggregation.rs +++ b/crates/polars-python/src/series/aggregation.rs @@ -145,4 +145,37 @@ impl PySeries { ) .into_py(py)) } + + #[cfg(feature = "bitwise")] + fn bitwise_and(&self, py: Python) -> PyResult { + Ok(Wrap( + self.series + .and_reduce() + .map_err(PyPolarsErr::from)? + .as_any_value(), + ) + .into_py(py)) + } + + #[cfg(feature = "bitwise")] + fn bitwise_or(&self, py: Python) -> PyResult { + Ok(Wrap( + self.series + .or_reduce() + .map_err(PyPolarsErr::from)? + .as_any_value(), + ) + .into_py(py)) + } + + #[cfg(feature = "bitwise")] + fn bitwise_xor(&self, py: Python) -> PyResult { + Ok(Wrap( + self.series + .xor_reduce() + .map_err(PyPolarsErr::from)? + .as_any_value(), + ) + .into_py(py)) + } } diff --git a/py-polars/polars/series/series.py b/py-polars/polars/series/series.py index 11c160e22874..d75bed5a3b56 100644 --- a/py-polars/polars/series/series.py +++ b/py-polars/polars/series/series.py @@ -7384,12 +7384,15 @@ def bitwise_trailing_zeros(self) -> Self: def bitwise_and(self) -> Self: """Perform an aggregation of bitwise ANDs.""" + return self._s.bitwise_and() def bitwise_or(self) -> Self: """Perform an aggregation of bitwise ORs.""" + return self._s.bitwise_or() def bitwise_xor(self) -> Self: """Perform an aggregation of bitwise XORs.""" + return self._s.bitwise_xor() # Keep the `list` and `str` properties below at the end of the definition of Series, # as to not confuse mypy with the type annotation `str` and `list` diff --git a/py-polars/tests/unit/operations/test_bitwise.py b/py-polars/tests/unit/operations/test_bitwise.py index e7a957fb534b..f701e64b1c70 100644 --- a/py-polars/tests/unit/operations/test_bitwise.py +++ b/py-polars/tests/unit/operations/test_bitwise.py @@ -182,8 +182,8 @@ def test_bit_aggregations(dtype: pl.DataType) -> None: def test_bit_group_by(dtype: pl.DataType) -> None: df = pl.DataFrame( [ - pl.Series("g", [1, 1, 2, 3, 2, 4, 4], pl.Int8), - pl.Series("a", [0x74, 0x1C, 0x05, None, 0x70, 0x01, None], dtype), + pl.Series("g", [4, 1, 1, 2, 3, 2, 4, 4], pl.Int8), + pl.Series("a", [0x03, 0x74, 0x1C, 0x05, None, 0x70, 0x01, None], dtype), ] ) @@ -198,9 +198,9 @@ def test_bit_group_by(dtype: pl.DataType) -> None: pl.DataFrame( [ pl.Series("g", [1, 2, 3, 4], pl.Int8), - pl.Series("AND", [0x74 & 0x1C, 0x05 & 0x70, None, None], dtype), - pl.Series("OR", [0x74 | 0x1C, 0x05 | 0x70, None, None], dtype), - pl.Series("XOR", [0x74 ^ 0x1C, 0x05 ^ 0x70, None, None], dtype), + pl.Series("AND", [0x74 & 0x1C, 0x05 & 0x70, None, 0x01], dtype), + pl.Series("OR", [0x74 | 0x1C, 0x05 | 0x70, None, 0x03], dtype), + pl.Series("XOR", [0x74 ^ 0x1C, 0x05 ^ 0x70, None, 0x02], dtype), ] ), check_row_order=False, From fcc3528c4b245b9d06b5d09e607c535aa5260bdd Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Wed, 2 Oct 2024 11:48:29 +0200 Subject: [PATCH 2/3] clippy --- .../src/chunked_array/ops/bitwise_reduce.rs | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/crates/polars-core/src/chunked_array/ops/bitwise_reduce.rs b/crates/polars-core/src/chunked_array/ops/bitwise_reduce.rs index 867d04d878e6..d69dfa2f5d63 100644 --- a/crates/polars-core/src/chunked_array/ops/bitwise_reduce.rs +++ b/crates/polars-core/src/chunked_array/ops/bitwise_reduce.rs @@ -14,22 +14,19 @@ where fn and_reduce(&self) -> Option { self.downcast_iter() - .map(|arr| BitwiseKernel::reduce_and(arr)) - .flatten() + .filter_map(BitwiseKernel::reduce_and) .reduce( as BitwiseKernel>::bit_and) } fn or_reduce(&self) -> Option { self.downcast_iter() - .map(|arr| BitwiseKernel::reduce_or(arr)) - .flatten() + .filter_map(BitwiseKernel::reduce_or) .reduce( as BitwiseKernel>::bit_or) } fn xor_reduce(&self) -> Option { self.downcast_iter() - .map(|arr| BitwiseKernel::reduce_xor(arr)) - .flatten() + .filter_map(BitwiseKernel::reduce_xor) .reduce( as BitwiseKernel>::bit_xor) } } @@ -39,22 +36,19 @@ impl ChunkBitwiseReduce for ChunkedArray { fn and_reduce(&self) -> Option { self.downcast_iter() - .map(|arr| BitwiseKernel::reduce_and(arr)) - .flatten() + .filter_map(BitwiseKernel::reduce_and) .reduce(|a, b| a & b) } fn or_reduce(&self) -> Option { self.downcast_iter() - .map(|arr| BitwiseKernel::reduce_or(arr)) - .flatten() + .filter_map(BitwiseKernel::reduce_or) .reduce(|a, b| a | b) } fn xor_reduce(&self) -> Option { self.downcast_iter() - .map(|arr| BitwiseKernel::reduce_xor(arr)) - .flatten() + .filter_map(BitwiseKernel::reduce_xor) .reduce(|a, b| a ^ b) } } From d9f9d17a1f7d69a68f171a4a17bfab09fd789d08 Mon Sep 17 00:00:00 2001 From: Orson Peters Date: Wed, 2 Oct 2024 11:48:57 +0200 Subject: [PATCH 3/3] add missing leading_zeros and move bitwise_ aggregations to aggregations section --- py-polars/docs/source/reference/expressions/aggregation.rst | 3 +++ py-polars/docs/source/reference/expressions/computation.rst | 4 +--- py-polars/docs/source/reference/series/computation.rst | 1 + 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/py-polars/docs/source/reference/expressions/aggregation.rst b/py-polars/docs/source/reference/expressions/aggregation.rst index 1162f9a25ac3..123111120a73 100644 --- a/py-polars/docs/source/reference/expressions/aggregation.rst +++ b/py-polars/docs/source/reference/expressions/aggregation.rst @@ -12,6 +12,9 @@ Aggregation Expr.approx_n_unique Expr.arg_max Expr.arg_min + Expr.bitwise_and + Expr.bitwise_or + Expr.bitwise_xor Expr.count Expr.first Expr.implode diff --git a/py-polars/docs/source/reference/expressions/computation.rst b/py-polars/docs/source/reference/expressions/computation.rst index 4ad8e68a1bfd..4b488ec0d692 100644 --- a/py-polars/docs/source/reference/expressions/computation.rst +++ b/py-polars/docs/source/reference/expressions/computation.rst @@ -18,11 +18,9 @@ Computation Expr.bitwise_count_ones Expr.bitwise_count_zeros Expr.bitwise_leading_ones + Expr.bitwise_leading_zeros Expr.bitwise_trailing_ones Expr.bitwise_trailing_zeros - Expr.bitwise_and - Expr.bitwise_or - Expr.bitwise_xor Expr.cbrt Expr.cos Expr.cosh diff --git a/py-polars/docs/source/reference/series/computation.rst b/py-polars/docs/source/reference/series/computation.rst index 887fed5b0ec2..5ab80238a54d 100644 --- a/py-polars/docs/source/reference/series/computation.rst +++ b/py-polars/docs/source/reference/series/computation.rst @@ -18,6 +18,7 @@ Computation Series.bitwise_count_ones Series.bitwise_count_zeros Series.bitwise_leading_ones + Series.bitwise_leading_zeros Series.bitwise_trailing_ones Series.bitwise_trailing_zeros Series.bitwise_and