Skip to content

Commit

Permalink
feat(python): Enable Null datatype and null values by default in parametric testing (pola-rs#16192)
Browse files Browse the repository at this point in the history
  • Loading branch information
stinodego authored May 13, 2024
1 parent 8066952 commit f6b4f48
Show file tree
Hide file tree
Showing 11 changed files with 87 additions and 43 deletions.
6 changes: 2 additions & 4 deletions _typos.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,17 @@ extend-ignore-identifiers-re = [
]

[default.extend-identifiers]
arange = "arange"
bck = "bck"
Fo = "Fo"
ND = "ND"
ba = "ba"
nd = "nd"
opt_nd = "opt_nd"
ser = "ser"
strat = "strat"
width_strat = "width_strat"

[default.extend-words]
iif = "iif"
arange = "arange"
strat = "strat"
'"r0ot"' = "r0ot"
wee = "wee"

Expand Down
2 changes: 1 addition & 1 deletion py-polars/polars/series/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def expr_dispatch(cls: type[T]) -> type[T]:
* Applied to the Series class, and/or any Series 'NameSpace' classes.
* Walks the class attributes, looking for methods that have empty function
bodies, with signatures compatible with an existing Expr function.
* IIF both conditions are met, the empty method is decorated with @call_expr.
* IFF both conditions are met, the empty method is decorated with @call_expr.
"""
# create lookup of expression functions in this namespace
namespace = getattr(cls, "_accessor", None)
Expand Down
47 changes: 26 additions & 21 deletions py-polars/polars/testing/parametric/strategies/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from polars._utils.deprecation import issue_deprecation_warning
from polars.dataframe import DataFrame
from polars.datatypes import DataType, DataTypeClass
from polars.datatypes import DataType, DataTypeClass, Null
from polars.series import Series
from polars.string_cache import StringCache
from polars.testing.parametric.strategies._utils import flexhash
Expand Down Expand Up @@ -39,7 +39,7 @@ def series( # noqa: D417
min_size: int = 0,
max_size: int = _ROW_LIMIT,
strategy: SearchStrategy[Any] | None = None,
allow_null: bool = False,
allow_null: bool = True,
unique: bool = False,
chunked: bool | None = None,
allowed_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
Expand Down Expand Up @@ -68,7 +68,7 @@ def series( # noqa: D417
strategy : strategy, optional
supports overriding the default strategy for the given dtype.
allow_null : bool
Allow nulls as possible values.
Allow nulls as possible values and allow the `Null` data type by default.
unique : bool, optional
indicate whether Series values should all be distinct.
chunked : bool, optional
Expand Down Expand Up @@ -144,22 +144,28 @@ def series( # noqa: D417
allowed_dtypes = list(allowed_dtypes)
if isinstance(excluded_dtypes, (DataType, DataTypeClass)):
excluded_dtypes = [excluded_dtypes]
elif excluded_dtypes is not None and not isinstance(excluded_dtypes, Sequence):
excluded_dtypes = list(excluded_dtypes)
elif excluded_dtypes is not None:
if not isinstance(excluded_dtypes, list):
excluded_dtypes = list(excluded_dtypes)

if not allow_null and not (allowed_dtypes is not None and Null in allowed_dtypes):
if excluded_dtypes is None:
excluded_dtypes = [Null]
else:
excluded_dtypes.append(Null)

if strategy is None:
if dtype is None:
dtype = draw(
dtypes(allowed_dtypes=allowed_dtypes, excluded_dtypes=excluded_dtypes)
dtype_strat = dtypes(
allowed_dtypes=allowed_dtypes, excluded_dtypes=excluded_dtypes
)
else:
dtype = draw(
_instantiate_dtype(
dtype,
allowed_dtypes=allowed_dtypes,
excluded_dtypes=excluded_dtypes,
)
dtype_strat = _instantiate_dtype(
dtype,
allowed_dtypes=allowed_dtypes,
excluded_dtypes=excluded_dtypes,
)
dtype = draw(dtype_strat)

if size is None:
size = draw(st.integers(min_value=min_size, max_value=max_size))
Expand Down Expand Up @@ -213,7 +219,7 @@ def dataframes(
max_size: int = _ROW_LIMIT,
chunked: bool | None = None,
include_cols: Sequence[column] | column | None = None,
allow_null: bool | Mapping[str, bool] = False,
allow_null: bool | Mapping[str, bool] = True,
allowed_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
excluded_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
**kwargs: Any,
Expand All @@ -232,7 +238,7 @@ def dataframes(
max_size: int = _ROW_LIMIT,
chunked: bool | None = None,
include_cols: Sequence[column] | column | None = None,
allow_null: bool | Mapping[str, bool] = False,
allow_null: bool | Mapping[str, bool] = True,
allowed_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
excluded_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
**kwargs: Any,
Expand All @@ -253,7 +259,7 @@ def dataframes( # noqa: D417
max_size: int = _ROW_LIMIT,
chunked: bool | None = None,
include_cols: Sequence[column] | column | None = None,
allow_null: bool | Mapping[str, bool] = False,
allow_null: bool | Mapping[str, bool] = True,
allowed_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
excluded_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
**kwargs: Any,
Expand Down Expand Up @@ -290,7 +296,8 @@ def dataframes( # noqa: D417
explicitly provided columns are appended onto the list of existing columns
(if any present).
allow_null : bool or Mapping[str, bool]
Allow nulls as possible values.
Allow nulls as possible values and allow the `Null` data type by default.
Accepts either a boolean or a mapping of column names to booleans.
allowed_dtypes : {list,set}, optional
when automatically generating data, allow only these dtypes.
excluded_dtypes : {list,set}, optional
Expand Down Expand Up @@ -404,12 +411,10 @@ def dataframes( # noqa: D417
c.name = f"col{idx}"
if c.allow_null is None:
if isinstance(allow_null, Mapping):
c.allow_null = allow_null.get(c.name, False)
c.allow_null = allow_null.get(c.name, True)
else:
c.allow_null = allow_null

# init dataframe from generated series data; series data is
# given as a python-native sequence.
with StringCache():
data = {
c.name: draw(
Expand Down Expand Up @@ -456,7 +461,7 @@ class column:
strategy : strategy, optional
supports overriding the default strategy for the given dtype.
allow_null : bool, optional
Allow nulls as possible values.
Allow nulls as possible values and allow the `Null` data type by default.
unique : bool, optional
flag indicating that all values generated for the column should be unique.
Expand Down
3 changes: 2 additions & 1 deletion py-polars/polars/testing/parametric/strategies/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,14 +349,15 @@ def data(
)
elif dtype == List:
inner = getattr(dtype, "inner", None) or Null()
strategy = lists(inner, **kwargs)
strategy = lists(inner, allow_null=allow_null, **kwargs)
elif dtype == Array:
inner = getattr(dtype, "inner", None) or Null()
width = getattr(dtype, "width", _DEFAULT_ARRAY_WIDTH_LIMIT)
strategy = lists(
inner,
min_len=width,
max_len=width,
allow_null=allow_null,
**kwargs,
)
else:
Expand Down
1 change: 1 addition & 0 deletions py-polars/polars/testing/parametric/strategies/dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
Binary,
Date,
Time,
Null,
]
# Supported data type classes with arguments
_COMPLEX_DTYPES: list[DataTypeClass] = [
Expand Down
2 changes: 1 addition & 1 deletion py-polars/tests/unit/functions/range/test_date_range.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@


def test_date_range() -> None:
    # if low/high are both date, range will also be date _iif_ the granularity is >= 1d
    # if low/high are both date, range will also be date _iff_ the granularity is >= 1d
result = pl.date_range(date(2022, 1, 1), date(2022, 3, 1), "1mo", eager=True)
assert result.to_list() == [date(2022, 1, 1), date(2022, 2, 1), date(2022, 3, 1)]

Expand Down
17 changes: 15 additions & 2 deletions py-polars/tests/unit/interchange/test_roundtrip.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,12 @@
]


@given(dataframes(allowed_dtypes=protocol_dtypes))
@given(
dataframes(
allowed_dtypes=protocol_dtypes,
allow_null=False, # Bug: https://github.com/pola-rs/polars/issues/16190
)
)
def test_to_dataframe_pyarrow_parametric(df: pl.DataFrame) -> None:
dfi = df.__dataframe__()
df_pa = pa.interchange.from_dataframe(dfi)
Expand Down Expand Up @@ -71,7 +76,12 @@ def test_to_dataframe_pyarrow_zero_copy_parametric(df: pl.DataFrame) -> None:
@pytest.mark.filterwarnings(
"ignore:.*PEP3118 format string that does not match its itemsize:RuntimeWarning"
)
@given(dataframes(allowed_dtypes=protocol_dtypes))
@given(
dataframes(
allowed_dtypes=protocol_dtypes,
allow_null=False, # Bug: https://github.com/pola-rs/polars/issues/16190
)
)
def test_to_dataframe_pandas_parametric(df: pl.DataFrame) -> None:
dfi = df.__dataframe__()
df_pd = pd.api.interchange.from_dataframe(dfi)
Expand All @@ -94,6 +104,7 @@ def test_to_dataframe_pandas_parametric(df: pl.DataFrame) -> None:
pl.Categorical,
],
chunked=False,
allow_null=False, # Bug: https://github.com/pola-rs/polars/issues/16190
)
)
def test_to_dataframe_pandas_zero_copy_parametric(df: pl.DataFrame) -> None:
Expand Down Expand Up @@ -193,6 +204,7 @@ def test_from_dataframe_pandas_zero_copy_parametric(df: pl.DataFrame) -> None:
# Empty string columns cause an error due to a bug in pandas.
# https://github.com/pandas-dev/pandas/issues/56703
min_size=1,
allow_null=False, # Bug: https://github.com/pola-rs/polars/issues/16190
)
)
def test_from_dataframe_pandas_native_parametric(df: pl.DataFrame) -> None:
Expand All @@ -217,6 +229,7 @@ def test_from_dataframe_pandas_native_parametric(df: pl.DataFrame) -> None:
# https://github.com/pandas-dev/pandas/issues/56700
min_size=1,
chunked=False,
allow_null=False, # Bug: https://github.com/pola-rs/polars/issues/16190
)
)
def test_from_dataframe_pandas_native_zero_copy_parametric(df: pl.DataFrame) -> None:
Expand Down
17 changes: 13 additions & 4 deletions py-polars/tests/unit/interop/numpy/test_to_numpy_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,11 +326,19 @@ def test_series_to_numpy_temporal() -> None:

@given(
s=series(
min_size=1, max_size=10, excluded_dtypes=[pl.Categorical, pl.List, pl.Struct]
min_size=1,
max_size=10,
excluded_dtypes=[
pl.Categorical,
pl.List,
pl.Struct,
pl.Datetime("ms"),
pl.Duration("ms"),
],
allow_null=False,
).filter(
lambda s: (
getattr(s.dtype, "time_unit", None) != "ms"
and not (s.dtype == pl.String and s.str.contains("\x00").any())
not (s.dtype == pl.String and s.str.contains("\x00").any())
and not (s.dtype == pl.Binary and s.bin.contains(b"\x00").any())
)
),
Expand All @@ -345,8 +353,9 @@ def test_series_to_numpy(s: pl.Series) -> None:
pl.Datetime("us"): "datetime64[us]",
pl.Duration("ns"): "timedelta64[ns]",
pl.Duration("us"): "timedelta64[us]",
pl.Null(): "float32",
}
np_dtype = dtype_map.get(s.dtype) # type: ignore[call-overload]
np_dtype = dtype_map.get(s.dtype)
expected = np.array(values, dtype=np_dtype)

assert_array_equal(result, expected)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1350,7 +1350,7 @@ def test_series_duration_timeunits(


@given(
s=series(min_size=1, max_size=10, dtype=pl.Datetime),
s=series(min_size=1, max_size=10, dtype=pl.Datetime, allow_null=False),
)
def test_series_datetime_timeunits(
s: pl.Series,
Expand Down
1 change: 1 addition & 0 deletions py-polars/tests/unit/series/buffers/test_from_buffer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
s=series(
allowed_dtypes=(pl.INTEGER_DTYPES | pl.FLOAT_DTYPES | {pl.Boolean}),
chunked=False,
allow_null=False,
)
)
def test_series_from_buffer(s: pl.Series) -> None:
Expand Down
32 changes: 24 additions & 8 deletions py-polars/tests/unit/testing/parametric/strategies/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
def test_series_defaults(s: pl.Series) -> None:
assert isinstance(s, pl.Series)
assert s.name == ""
assert s.null_count() == 0


@given(s=series(name="hello"))
Expand All @@ -39,7 +38,7 @@ def test_series_dtype(data: st.DataObject) -> None:
assert s.dtype == dtype


@given(s=series(dtype=pl.Enum))
@given(s=series(dtype=pl.Enum, allow_null=False))
@settings(max_examples=5)
def test_series_dtype_enum(s: pl.Series) -> None:
assert isinstance(s.dtype, pl.Enum)
Expand All @@ -58,9 +57,21 @@ def test_series_size_range(s: pl.Series) -> None:
assert 3 <= s.len() <= 8


@given(s=series(allow_null=True))
def test_series_allow_null(s: pl.Series) -> None:
assert 0 <= s.null_count() <= s.len()
@given(s=series(allow_null=False))
def test_series_allow_null_false(s: pl.Series) -> None:
assert s.null_count() == 0
assert s.dtype != pl.Null


@given(s=series(allowed_dtypes=[pl.Null], allow_null=False))
def test_series_allow_null_allowed_dtypes(s: pl.Series) -> None:
assert s.dtype == pl.Null


@given(s=series(allowed_dtypes=[pl.List(pl.Int8)], allow_null=False))
def test_series_allow_null_nested(s: pl.Series) -> None:
for v in s:
assert v.null_count() == 0


@given(df=dataframes())
Expand Down Expand Up @@ -121,6 +132,7 @@ def test_dataframes_allow_null_override(df: pl.DataFrame) -> None:
# generate lazyframes with at least one row
lazy=True,
min_size=1,
allow_null=False,
# test mix & match of bulk-assigned cols with custom cols
cols=[column(n, dtype=pl.UInt8, unique=True) for n in ["a", "b"]],
include_cols=[
Expand Down Expand Up @@ -190,7 +202,8 @@ def test_allow_infinities_deprecated(data: st.DataObject) -> None:
min_len=1,
),
),
]
],
allow_null=False,
),
)
def test_dataframes_nested_strategies(df: pl.DataFrame) -> None:
Expand Down Expand Up @@ -255,9 +268,12 @@ def test_chunking(

@given(
df=dataframes(
allowed_dtypes=[pl.Float32, pl.Float64], max_cols=4, allow_infinity=False
allowed_dtypes=[pl.Float32, pl.Float64],
max_cols=4,
allow_null=False,
allow_infinity=False,
),
s=series(dtype=pl.Float64, allow_infinity=False),
s=series(dtype=pl.Float64, allow_null=False, allow_infinity=False),
)
def test_infinities(
df: pl.DataFrame,
Expand Down

0 comments on commit f6b4f48

Please sign in to comment.