Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(python): Enable Null datatype and null values by default in parametric testing #16192

Merged
6 commits merged on May 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions _typos.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,17 @@ extend-ignore-identifiers-re = [
]

[default.extend-identifiers]
arange = "arange"
bck = "bck"
Fo = "Fo"
ND = "ND"
ba = "ba"
nd = "nd"
opt_nd = "opt_nd"
ser = "ser"
strat = "strat"
width_strat = "width_strat"

[default.extend-words]
iif = "iif"
arange = "arange"
strat = "strat"
'"r0ot"' = "r0ot"
wee = "wee"

Expand Down
2 changes: 1 addition & 1 deletion py-polars/polars/series/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def expr_dispatch(cls: type[T]) -> type[T]:
* Applied to the Series class, and/or any Series 'NameSpace' classes.
* Walks the class attributes, looking for methods that have empty function
bodies, with signatures compatible with an existing Expr function.
* IIF both conditions are met, the empty method is decorated with @call_expr.
* IFF both conditions are met, the empty method is decorated with @call_expr.
"""
# create lookup of expression functions in this namespace
namespace = getattr(cls, "_accessor", None)
Expand Down
47 changes: 26 additions & 21 deletions py-polars/polars/testing/parametric/strategies/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from polars._utils.deprecation import issue_deprecation_warning
from polars.dataframe import DataFrame
from polars.datatypes import DataType, DataTypeClass
from polars.datatypes import DataType, DataTypeClass, Null
from polars.series import Series
from polars.string_cache import StringCache
from polars.testing.parametric.strategies._utils import flexhash
Expand Down Expand Up @@ -39,7 +39,7 @@ def series( # noqa: D417
min_size: int = 0,
max_size: int = _ROW_LIMIT,
strategy: SearchStrategy[Any] | None = None,
allow_null: bool = False,
allow_null: bool = True,
unique: bool = False,
chunked: bool | None = None,
allowed_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
Expand Down Expand Up @@ -68,7 +68,7 @@ def series( # noqa: D417
strategy : strategy, optional
supports overriding the default strategy for the given dtype.
allow_null : bool
Allow nulls as possible values.
Allow nulls as possible values and allow the `Null` data type by default.
unique : bool, optional
indicate whether Series values should all be distinct.
chunked : bool, optional
Expand Down Expand Up @@ -144,22 +144,28 @@ def series( # noqa: D417
allowed_dtypes = list(allowed_dtypes)
if isinstance(excluded_dtypes, (DataType, DataTypeClass)):
excluded_dtypes = [excluded_dtypes]
elif excluded_dtypes is not None and not isinstance(excluded_dtypes, Sequence):
excluded_dtypes = list(excluded_dtypes)
elif excluded_dtypes is not None:
if not isinstance(excluded_dtypes, list):
excluded_dtypes = list(excluded_dtypes)

if not allow_null and not (allowed_dtypes is not None and Null in allowed_dtypes):
if excluded_dtypes is None:
excluded_dtypes = [Null]
else:
excluded_dtypes.append(Null)

if strategy is None:
if dtype is None:
dtype = draw(
dtypes(allowed_dtypes=allowed_dtypes, excluded_dtypes=excluded_dtypes)
dtype_strat = dtypes(
allowed_dtypes=allowed_dtypes, excluded_dtypes=excluded_dtypes
)
else:
dtype = draw(
_instantiate_dtype(
dtype,
allowed_dtypes=allowed_dtypes,
excluded_dtypes=excluded_dtypes,
)
dtype_strat = _instantiate_dtype(
dtype,
allowed_dtypes=allowed_dtypes,
excluded_dtypes=excluded_dtypes,
)
dtype = draw(dtype_strat)

if size is None:
size = draw(st.integers(min_value=min_size, max_value=max_size))
Expand Down Expand Up @@ -213,7 +219,7 @@ def dataframes(
max_size: int = _ROW_LIMIT,
chunked: bool | None = None,
include_cols: Sequence[column] | column | None = None,
allow_null: bool | Mapping[str, bool] = False,
allow_null: bool | Mapping[str, bool] = True,
allowed_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
excluded_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
**kwargs: Any,
Expand All @@ -232,7 +238,7 @@ def dataframes(
max_size: int = _ROW_LIMIT,
chunked: bool | None = None,
include_cols: Sequence[column] | column | None = None,
allow_null: bool | Mapping[str, bool] = False,
allow_null: bool | Mapping[str, bool] = True,
allowed_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
excluded_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
**kwargs: Any,
Expand All @@ -253,7 +259,7 @@ def dataframes( # noqa: D417
max_size: int = _ROW_LIMIT,
chunked: bool | None = None,
include_cols: Sequence[column] | column | None = None,
allow_null: bool | Mapping[str, bool] = False,
allow_null: bool | Mapping[str, bool] = True,
allowed_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
excluded_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
**kwargs: Any,
Expand Down Expand Up @@ -290,7 +296,8 @@ def dataframes( # noqa: D417
explicitly provided columns are appended onto the list of existing columns
(if any present).
allow_null : bool or Mapping[str, bool]
Allow nulls as possible values.
Allow nulls as possible values and allow the `Null` data type by default.
Accepts either a boolean or a mapping of column names to booleans.
allowed_dtypes : {list,set}, optional
when automatically generating data, allow only these dtypes.
excluded_dtypes : {list,set}, optional
Expand Down Expand Up @@ -404,12 +411,10 @@ def dataframes( # noqa: D417
c.name = f"col{idx}"
if c.allow_null is None:
if isinstance(allow_null, Mapping):
c.allow_null = allow_null.get(c.name, False)
c.allow_null = allow_null.get(c.name, True)
else:
c.allow_null = allow_null

# init dataframe from generated series data; series data is
# given as a python-native sequence.
with StringCache():
data = {
c.name: draw(
Expand Down Expand Up @@ -456,7 +461,7 @@ class column:
strategy : strategy, optional
supports overriding the default strategy for the given dtype.
allow_null : bool, optional
Allow nulls as possible values.
Allow nulls as possible values and allow the `Null` data type by default.
unique : bool, optional
flag indicating that all values generated for the column should be unique.

Expand Down
3 changes: 2 additions & 1 deletion py-polars/polars/testing/parametric/strategies/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,14 +349,15 @@ def data(
)
elif dtype == List:
inner = getattr(dtype, "inner", None) or Null()
strategy = lists(inner, **kwargs)
strategy = lists(inner, allow_null=allow_null, **kwargs)
elif dtype == Array:
inner = getattr(dtype, "inner", None) or Null()
width = getattr(dtype, "width", _DEFAULT_ARRAY_WIDTH_LIMIT)
strategy = lists(
inner,
min_len=width,
max_len=width,
allow_null=allow_null,
**kwargs,
)
else:
Expand Down
1 change: 1 addition & 0 deletions py-polars/polars/testing/parametric/strategies/dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
Binary,
Date,
Time,
Null,
]
# Supported data type classes with arguments
_COMPLEX_DTYPES: list[DataTypeClass] = [
Expand Down
2 changes: 1 addition & 1 deletion py-polars/tests/unit/functions/range/test_date_range.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@


def test_date_range() -> None:
# if low/high are both date, range is also be date _iif_ the granularity is >= 1d
# if low/high are both dates, the range is also a date range _iff_ the granularity is >= 1d
result = pl.date_range(date(2022, 1, 1), date(2022, 3, 1), "1mo", eager=True)
assert result.to_list() == [date(2022, 1, 1), date(2022, 2, 1), date(2022, 3, 1)]

Expand Down
17 changes: 15 additions & 2 deletions py-polars/tests/unit/interchange/test_roundtrip.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,12 @@
]


@given(dataframes(allowed_dtypes=protocol_dtypes))
@given(
dataframes(
allowed_dtypes=protocol_dtypes,
allow_null=False, # Bug: https://github.com/pola-rs/polars/issues/16190
)
)
def test_to_dataframe_pyarrow_parametric(df: pl.DataFrame) -> None:
dfi = df.__dataframe__()
df_pa = pa.interchange.from_dataframe(dfi)
Expand Down Expand Up @@ -71,7 +76,12 @@ def test_to_dataframe_pyarrow_zero_copy_parametric(df: pl.DataFrame) -> None:
@pytest.mark.filterwarnings(
"ignore:.*PEP3118 format string that does not match its itemsize:RuntimeWarning"
)
@given(dataframes(allowed_dtypes=protocol_dtypes))
@given(
dataframes(
allowed_dtypes=protocol_dtypes,
allow_null=False, # Bug: https://github.com/pola-rs/polars/issues/16190
)
)
def test_to_dataframe_pandas_parametric(df: pl.DataFrame) -> None:
dfi = df.__dataframe__()
df_pd = pd.api.interchange.from_dataframe(dfi)
Expand All @@ -94,6 +104,7 @@ def test_to_dataframe_pandas_parametric(df: pl.DataFrame) -> None:
pl.Categorical,
],
chunked=False,
allow_null=False, # Bug: https://github.com/pola-rs/polars/issues/16190
)
)
def test_to_dataframe_pandas_zero_copy_parametric(df: pl.DataFrame) -> None:
Expand Down Expand Up @@ -193,6 +204,7 @@ def test_from_dataframe_pandas_zero_copy_parametric(df: pl.DataFrame) -> None:
# Empty string columns cause an error due to a bug in pandas.
# https://github.com/pandas-dev/pandas/issues/56703
min_size=1,
allow_null=False, # Bug: https://github.com/pola-rs/polars/issues/16190
)
)
def test_from_dataframe_pandas_native_parametric(df: pl.DataFrame) -> None:
Expand All @@ -217,6 +229,7 @@ def test_from_dataframe_pandas_native_parametric(df: pl.DataFrame) -> None:
# https://github.com/pandas-dev/pandas/issues/56700
min_size=1,
chunked=False,
allow_null=False, # Bug: https://github.com/pola-rs/polars/issues/16190
)
)
def test_from_dataframe_pandas_native_zero_copy_parametric(df: pl.DataFrame) -> None:
Expand Down
17 changes: 13 additions & 4 deletions py-polars/tests/unit/interop/numpy/test_to_numpy_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,11 +326,19 @@ def test_series_to_numpy_temporal() -> None:

@given(
s=series(
min_size=1, max_size=10, excluded_dtypes=[pl.Categorical, pl.List, pl.Struct]
min_size=1,
max_size=10,
excluded_dtypes=[
pl.Categorical,
pl.List,
pl.Struct,
pl.Datetime("ms"),
pl.Duration("ms"),
],
allow_null=False,
).filter(
lambda s: (
getattr(s.dtype, "time_unit", None) != "ms"
and not (s.dtype == pl.String and s.str.contains("\x00").any())
not (s.dtype == pl.String and s.str.contains("\x00").any())
and not (s.dtype == pl.Binary and s.bin.contains(b"\x00").any())
)
),
Expand All @@ -345,8 +353,9 @@ def test_series_to_numpy(s: pl.Series) -> None:
pl.Datetime("us"): "datetime64[us]",
pl.Duration("ns"): "timedelta64[ns]",
pl.Duration("us"): "timedelta64[us]",
pl.Null(): "float32",
}
np_dtype = dtype_map.get(s.dtype) # type: ignore[call-overload]
np_dtype = dtype_map.get(s.dtype)
expected = np.array(values, dtype=np_dtype)

assert_array_equal(result, expected)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1350,7 +1350,7 @@ def test_series_duration_timeunits(


@given(
s=series(min_size=1, max_size=10, dtype=pl.Datetime),
s=series(min_size=1, max_size=10, dtype=pl.Datetime, allow_null=False),
)
def test_series_datetime_timeunits(
s: pl.Series,
Expand Down
1 change: 1 addition & 0 deletions py-polars/tests/unit/series/buffers/test_from_buffer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
s=series(
allowed_dtypes=(pl.INTEGER_DTYPES | pl.FLOAT_DTYPES | {pl.Boolean}),
chunked=False,
allow_null=False,
)
)
def test_series_from_buffer(s: pl.Series) -> None:
Expand Down
32 changes: 24 additions & 8 deletions py-polars/tests/unit/testing/parametric/strategies/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
def test_series_defaults(s: pl.Series) -> None:
assert isinstance(s, pl.Series)
assert s.name == ""
assert s.null_count() == 0


@given(s=series(name="hello"))
Expand All @@ -39,7 +38,7 @@ def test_series_dtype(data: st.DataObject) -> None:
assert s.dtype == dtype


@given(s=series(dtype=pl.Enum))
@given(s=series(dtype=pl.Enum, allow_null=False))
@settings(max_examples=5)
def test_series_dtype_enum(s: pl.Series) -> None:
assert isinstance(s.dtype, pl.Enum)
Expand All @@ -58,9 +57,21 @@ def test_series_size_range(s: pl.Series) -> None:
assert 3 <= s.len() <= 8


@given(s=series(allow_null=True))
def test_series_allow_null(s: pl.Series) -> None:
assert 0 <= s.null_count() <= s.len()
@given(s=series(allow_null=False))
def test_series_allow_null_false(s: pl.Series) -> None:
assert s.null_count() == 0
assert s.dtype != pl.Null


@given(s=series(allowed_dtypes=[pl.Null], allow_null=False))
def test_series_allow_null_allowed_dtypes(s: pl.Series) -> None:
assert s.dtype == pl.Null


@given(s=series(allowed_dtypes=[pl.List(pl.Int8)], allow_null=False))
def test_series_allow_null_nested(s: pl.Series) -> None:
for v in s:
assert v.null_count() == 0


@given(df=dataframes())
Expand Down Expand Up @@ -121,6 +132,7 @@ def test_dataframes_allow_null_override(df: pl.DataFrame) -> None:
# generate lazyframes with at least one row
lazy=True,
min_size=1,
allow_null=False,
# test mix & match of bulk-assigned cols with custom cols
cols=[column(n, dtype=pl.UInt8, unique=True) for n in ["a", "b"]],
include_cols=[
Expand Down Expand Up @@ -190,7 +202,8 @@ def test_allow_infinities_deprecated(data: st.DataObject) -> None:
min_len=1,
),
),
]
],
allow_null=False,
),
)
def test_dataframes_nested_strategies(df: pl.DataFrame) -> None:
Expand Down Expand Up @@ -255,9 +268,12 @@ def test_chunking(

@given(
df=dataframes(
allowed_dtypes=[pl.Float32, pl.Float64], max_cols=4, allow_infinity=False
allowed_dtypes=[pl.Float32, pl.Float64],
max_cols=4,
allow_null=False,
allow_infinity=False,
),
s=series(dtype=pl.Float64, allow_infinity=False),
s=series(dtype=pl.Float64, allow_null=False, allow_infinity=False),
)
def test_infinities(
df: pl.DataFrame,
Expand Down