Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix!: Fix NaN ordering to make NaNs compare greater than any other float, and equal to themselves #12721

Merged
merged 35 commits into from
Nov 30, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
434a7d7
move TotalOrd to polars-utils
orlp Oct 16, 2023
a309b3b
add polars-compute
orlp Oct 16, 2023
eaed1e7
wip
orlp Oct 17, 2023
c1912d6
remove nans_compare_equal
orlp Nov 23, 2023
913a4f1
remove float comparison exceptions
orlp Nov 23, 2023
d37d9c0
mostly fix broadcasting comparisons
orlp Nov 24, 2023
01fb1bb
remove inconsistent null equality optimization
orlp Nov 27, 2023
33f09b9
add warning to always-null comparisons
orlp Nov 27, 2023
3c5e101
fmt
orlp Nov 27, 2023
14e43eb
fix warnings in tests
orlp Nov 27, 2023
5c4a943
fix _missing comparison ops
orlp Nov 27, 2023
467b7fc
clippy
orlp Nov 27, 2023
d7fb115
remove not_equal_and_validity
orlp Nov 28, 2023
5354923
add new string comparison kernels
orlp Nov 28, 2023
da58d14
define gt/ge in terms of lt/le
orlp Nov 28, 2023
74f4e3f
add _missing kernels
orlp Nov 28, 2023
b6359c4
add array support to comparison kernels
orlp Nov 28, 2023
a50997d
fmt/clippy
orlp Nov 28, 2023
5a2c321
add boolean comparison kernels
orlp Nov 29, 2023
06b9426
expand comparison tests
orlp Nov 29, 2023
80a12d9
fix test
orlp Nov 29, 2023
0b93380
user new string broadcast comparison kernels
orlp Nov 29, 2023
ed030c3
remove old comparison kernels
orlp Nov 29, 2023
ba409a4
clippy
orlp Nov 29, 2023
ea99583
fix bad/outdated tests
orlp Nov 29, 2023
9013166
fix trait bounds
orlp Nov 29, 2023
3c63fec
fix conditional import
orlp Nov 29, 2023
cb4da95
fix another bad test
orlp Nov 29, 2023
1bd4957
fix failing doctest
orlp Nov 29, 2023
ba8c5e2
address review comments
orlp Nov 30, 2023
d58dc7c
fix mypy
orlp Nov 30, 2023
e2e8b85
fix incorrect bitcount
orlp Nov 30, 2023
3b00eb1
add missing inline
orlp Nov 30, 2023
733c634
add missing comment
orlp Nov 30, 2023
7a340e4
fmt
orlp Nov 30, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
add array support to comparison kernels
  • Loading branch information
orlp committed Nov 28, 2023
commit b6359c4cf64d8d38895f311cb0e7c42bf554d355
15 changes: 12 additions & 3 deletions crates/polars-arrow/src/bitmap/immutable.rs
Original file line number Diff line number Diff line change
Expand Up @@ -282,9 +282,18 @@ impl Bitmap {
/// Initializes an new [`Bitmap`] filled with unset values.
#[inline]
pub fn new_zeroed(length: usize) -> Self {
// don't use `MutableBitmap::from_len_zeroed().into()`
// it triggers a bitcount
let bytes = vec![0; length.saturating_add(7) / 8];
Self::new_with_value(false, length)
}

/// Initializes an new [`Bitmap`] filled with the given value.
#[inline]
pub fn new_with_value(value: bool, length: usize) -> Self {
// Don't use `MutableBitmap::from_len_zeroed().into()`, it triggers a bitcount.
let bytes = if value {
vec![u8::MAX; length.saturating_add(7) / 8]
} else {
vec![0; length.saturating_add(7) / 8]
};
unsafe { Bitmap::from_inner_unchecked(Arc::new(bytes.into()), 0, length, length) }
orlp marked this conversation as resolved.
Show resolved Hide resolved
}

Expand Down
1 change: 1 addition & 0 deletions crates/polars-compute/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,4 @@ version_check = { workspace = true }
[features]
nightly = []
simd = []
dtype-array = []
119 changes: 119 additions & 0 deletions crates/polars-compute/src/comparisons/array.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
use arrow::array::{Array, BinaryArray, FixedSizeListArray, PrimitiveArray, Utf8Array};
use arrow::bitmap::utils::count_zeros;
use arrow::bitmap::Bitmap;
use arrow::datatypes::ArrowDataType;

use crate::comparisons::TotalOrdKernel;

fn agg_array_bitmap<F>(bm: Bitmap, width: usize, true_zero_count: F) -> Bitmap
where
orlp marked this conversation as resolved.
Show resolved Hide resolved
F: Fn(usize) -> bool,
{
assert!(width > 0 && bm.len() % width == 0);
let (slice, offset, _len) = bm.as_slice();

(0..bm.len() / width)
.map(|i| true_zero_count(count_zeros(slice, offset + i * width, width)))
.collect()
}

macro_rules! call_binary {
($T:ty, $lhs:expr, $rhs:expr, $op:path) => {{
let lhs: &$T = $lhs.as_any().downcast_ref().unwrap();
let rhs: &$T = $rhs.as_any().downcast_ref().unwrap();
$op(lhs, rhs)
}};
}

macro_rules! compare {
($lhs:expr, $rhs:expr, $wrong_width:expr, $op:path) => {{
let lhs = $lhs;
let rhs = $rhs;
assert_eq!(lhs.len(), rhs.len());
let ArrowDataType::FixedSizeList(lhs_type, lhs_width) = lhs.data_type().to_logical_type() else {
panic!("array comparison called with non-array type");
};
let ArrowDataType::FixedSizeList(rhs_type, rhs_width) = rhs.data_type().to_logical_type() else {
panic!("array comparison called with non-array type");
};
assert_eq!(lhs_type.data_type(), rhs_type.data_type());

if lhs_width != rhs_width {
return Bitmap::new_with_value($wrong_width, lhs.len());
}

use arrow::datatypes::PhysicalType::*;
use arrow::datatypes::PrimitiveType::*;
let lv = lhs.values();
let rv = rhs.values();
match lhs_type.data_type().to_physical_type() {
// Boolean => call_binary!(BooleanArray, lhs, rhs, $op),
Boolean => todo!(),
LargeUtf8 => call_binary!(Utf8Array<i64>, lv, rv, $op),
LargeBinary => call_binary!(BinaryArray<i64>, lv, rv, $op),
Primitive(Int8) => call_binary!(PrimitiveArray<i8>, lv, rv, $op),
Primitive(Int16) => call_binary!(PrimitiveArray<i16>, lv, rv, $op),
Primitive(Int32) => call_binary!(PrimitiveArray<i32>, lv, rv, $op),
Primitive(Int64) => call_binary!(PrimitiveArray<i64>, lv, rv, $op),
Primitive(Int128) => call_binary!(PrimitiveArray<i128>, lv, rv, $op),
Primitive(UInt8) => call_binary!(PrimitiveArray<u8>, lv, rv, $op),
Primitive(UInt16) => call_binary!(PrimitiveArray<u16>, lv, rv, $op),
Primitive(UInt32) => call_binary!(PrimitiveArray<u32>, lv, rv, $op),
Primitive(UInt64) => call_binary!(PrimitiveArray<i64>, lv, rv, $op),
Primitive(Float32) => call_binary!(PrimitiveArray<f32>, lv, rv, $op),
Primitive(Float64) => call_binary!(PrimitiveArray<f64>, lv, rv, $op),
_ => todo!(
"Comparison between {:?} are not yet supported",
lhs.data_type().to_physical_type()
),
}
}};
}

impl TotalOrdKernel for FixedSizeListArray {
type Scalar = Box<dyn Array>;

fn tot_eq_kernel(&self, other: &Self) -> Bitmap {
// Nested comparison always done with eq_missing, propagating doesn't
// make any sense.
let inner = compare!(self, other, false, TotalOrdKernel::tot_eq_missing_kernel);
agg_array_bitmap(inner, self.size(), |zeroes| zeroes == 0)
}

fn tot_ne_kernel(&self, other: &Self) -> Bitmap {
let inner = compare!(self, other, true, TotalOrdKernel::tot_eq_missing_kernel);
agg_array_bitmap(inner, self.size(), |zeroes| zeroes > 0)
}

fn tot_lt_kernel(&self, _other: &Self) -> Bitmap {
unimplemented!()
}

fn tot_le_kernel(&self, _other: &Self) -> Bitmap {
unimplemented!()
}

fn tot_eq_kernel_broadcast(&self, _other: &Self::Scalar) -> Bitmap {
todo!()
}

fn tot_ne_kernel_broadcast(&self, _other: &Self::Scalar) -> Bitmap {
todo!()
}

fn tot_lt_kernel_broadcast(&self, _other: &Self::Scalar) -> Bitmap {
unimplemented!()
}

fn tot_le_kernel_broadcast(&self, _other: &Self::Scalar) -> Bitmap {
unimplemented!()
}

fn tot_gt_kernel_broadcast(&self, _other: &Self::Scalar) -> Bitmap {
unimplemented!()
}

fn tot_ge_kernel_broadcast(&self, _other: &Self::Scalar) -> Bitmap {
unimplemented!()
}
}
3 changes: 3 additions & 0 deletions crates/polars-compute/src/comparisons/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,6 @@ mod scalar;

#[cfg(feature = "simd")]
mod simd;

#[cfg(feature = "dtype-array")]
mod array;
2 changes: 1 addition & 1 deletion crates/polars-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ dtype-date = ["temporal"]
dtype-datetime = ["temporal"]
dtype-duration = ["temporal"]
dtype-time = ["temporal"]
dtype-array = ["arrow/dtype-array"]
dtype-array = ["arrow/dtype-array", "polars-compute/dtype-array"]
dtype-i8 = []
dtype-i16 = []
dtype-decimal = ["dep:itoap", "arrow/dtype-decimal"]
Expand Down
28 changes: 4 additions & 24 deletions crates/polars-core/src/chunked_array/comparison/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -703,48 +703,28 @@ impl ChunkCompare<&ArrayChunked> for ArrayChunked {
if self.width() != rhs.width() {
return BooleanChunked::full("", false, self.len());
}
arity::binary_mut_with_options(
self,
rhs,
arrow::legacy::kernels::comparison::fixed_size_list_eq,
"",
)
arity::binary_mut_values(self, rhs, |a, b| a.tot_eq_kernel(b).into(), "")
}

fn equal_missing(&self, rhs: &ArrayChunked) -> BooleanChunked {
if self.width() != rhs.width() {
return BooleanChunked::full("", false, self.len());
}
arity::binary_mut_with_options(
self,
rhs,
arrow::legacy::kernels::comparison::fixed_size_list_eq_missing,
"",
)
arity::binary_mut_with_options(self, rhs, |a, b| a.tot_eq_missing_kernel(b).into(), "")
}

fn not_equal(&self, rhs: &ArrayChunked) -> BooleanChunked {
if self.width() != rhs.width() {
return BooleanChunked::full("", true, self.len());
}
arity::binary_mut_with_options(
self,
rhs,
arrow::legacy::kernels::comparison::fixed_size_list_neq,
"",
)
arity::binary_mut_values(self, rhs, |a, b| a.tot_ne_kernel(b).into(), "")
}

fn not_equal_missing(&self, rhs: &ArrayChunked) -> Self::Item {
if self.width() != rhs.width() {
return BooleanChunked::full("", true, self.len());
}
arity::binary_mut_with_options(
self,
rhs,
arrow::legacy::kernels::comparison::fixed_size_list_neq_missing,
"",
)
arity::binary_mut_with_options(self, rhs, |a, b| a.tot_ne_missing_kernel(b).into(), "")
}

// following are not implemented because gt, lt comparison of series don't make sense
Expand Down
6 changes: 4 additions & 2 deletions py-polars/tests/unit/datatypes/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,10 +119,12 @@ def test_array_equal_and_not_equal() -> None:

left = pl.Series([[1, None], [3, None]], dtype=pl.Array(width=2, inner=pl.Int64))
right = pl.Series([[1, None], [3, 4]], dtype=pl.Array(width=2, inner=pl.Int64))
assert_series_equal(left == right, pl.Series([False, False]))
assert_series_equal(left == right, pl.Series([True, False]))
assert_series_equal(left.eq_missing(right), pl.Series([True, False]))
assert_series_equal(left != right, pl.Series([True, True]))
assert_series_equal(left != right, pl.Series([False, True]))
assert_series_equal(left.ne_missing(right), pl.Series([False, True]))

# TODO: test eq_missing with nulled arrays, rather than null elements.


def test_array_init_deprecation() -> None:
Expand Down