Skip to content

Commit

Permalink
feat(python): Enable Null datatype and null values by default in parametric testing (pola-rs#16192)
Browse files Browse the repository at this point in the history
  • Loading branch information
stinodego authored May 13, 2024
1 parent 8066952 commit f6b4f48
Show file tree
Hide file tree
Showing 11 changed files with 87 additions and 43 deletions.
6 changes: 2 additions & 4 deletions _typos.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,17 @@ extend-ignore-identifiers-re = [
]

[default.extend-identifiers]
arange = "arange"
bck = "bck"
Fo = "Fo"
ND = "ND"
ba = "ba"
nd = "nd"
opt_nd = "opt_nd"
ser = "ser"
strat = "strat"
width_strat = "width_strat"

[default.extend-words]
iif = "iif"
arange = "arange"
strat = "strat"
'"r0ot"' = "r0ot"
wee = "wee"

Expand Down
2 changes: 1 addition & 1 deletion py-polars/polars/series/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def expr_dispatch(cls: type[T]) -> type[T]:
* Applied to the Series class, and/or any Series 'NameSpace' classes.
* Walks the class attributes, looking for methods that have empty function
bodies, with signatures compatible with an existing Expr function.
* IIF both conditions are met, the empty method is decorated with @call_expr.
* IFF both conditions are met, the empty method is decorated with @call_expr.
"""
# create lookup of expression functions in this namespace
namespace = getattr(cls, "_accessor", None)
Expand Down
47 changes: 26 additions & 21 deletions py-polars/polars/testing/parametric/strategies/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from polars._utils.deprecation import issue_deprecation_warning
from polars.dataframe import DataFrame
from polars.datatypes import DataType, DataTypeClass
from polars.datatypes import DataType, DataTypeClass, Null
from polars.series import Series
from polars.string_cache import StringCache
from polars.testing.parametric.strategies._utils import flexhash
Expand Down Expand Up @@ -39,7 +39,7 @@ def series( # noqa: D417
min_size: int = 0,
max_size: int = _ROW_LIMIT,
strategy: SearchStrategy[Any] | None = None,
allow_null: bool = False,
allow_null: bool = True,
unique: bool = False,
chunked: bool | None = None,
allowed_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
Expand Down Expand Up @@ -68,7 +68,7 @@ def series( # noqa: D417
strategy : strategy, optional
supports overriding the default strategy for the given dtype.
allow_null : bool
Allow nulls as possible values.
Allow nulls as possible values and allow the `Null` data type by default.
unique : bool, optional
indicate whether Series values should all be distinct.
chunked : bool, optional
Expand Down Expand Up @@ -144,22 +144,28 @@ def series( # noqa: D417
allowed_dtypes = list(allowed_dtypes)
if isinstance(excluded_dtypes, (DataType, DataTypeClass)):
excluded_dtypes = [excluded_dtypes]
elif excluded_dtypes is not None and not isinstance(excluded_dtypes, Sequence):
excluded_dtypes = list(excluded_dtypes)
elif excluded_dtypes is not None:
if not isinstance(excluded_dtypes, list):
excluded_dtypes = list(excluded_dtypes)

if not allow_null and not (allowed_dtypes is not None and Null in allowed_dtypes):
if excluded_dtypes is None:
excluded_dtypes = [Null]
else:
excluded_dtypes.append(Null)

if strategy is None:
if dtype is None:
dtype = draw(
dtypes(allowed_dtypes=allowed_dtypes, excluded_dtypes=excluded_dtypes)
dtype_strat = dtypes(
allowed_dtypes=allowed_dtypes, excluded_dtypes=excluded_dtypes
)
else:
dtype = draw(
_instantiate_dtype(
dtype,
allowed_dtypes=allowed_dtypes,
excluded_dtypes=excluded_dtypes,
)
dtype_strat = _instantiate_dtype(
dtype,
allowed_dtypes=allowed_dtypes,
excluded_dtypes=excluded_dtypes,
)
dtype = draw(dtype_strat)

if size is None:
size = draw(st.integers(min_value=min_size, max_value=max_size))
Expand Down Expand Up @@ -213,7 +219,7 @@ def dataframes(
max_size: int = _ROW_LIMIT,
chunked: bool | None = None,
include_cols: Sequence[column] | column | None = None,
allow_null: bool | Mapping[str, bool] = False,
allow_null: bool | Mapping[str, bool] = True,
allowed_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
excluded_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
**kwargs: Any,
Expand All @@ -232,7 +238,7 @@ def dataframes(
max_size: int = _ROW_LIMIT,
chunked: bool | None = None,
include_cols: Sequence[column] | column | None = None,
allow_null: bool | Mapping[str, bool] = False,
allow_null: bool | Mapping[str, bool] = True,
allowed_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
excluded_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
**kwargs: Any,
Expand All @@ -253,7 +259,7 @@ def dataframes( # noqa: D417
max_size: int = _ROW_LIMIT,
chunked: bool | None = None,
include_cols: Sequence[column] | column | None = None,
allow_null: bool | Mapping[str, bool] = False,
allow_null: bool | Mapping[str, bool] = True,
allowed_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
excluded_dtypes: Collection[PolarsDataType] | PolarsDataType | None = None,
**kwargs: Any,
Expand Down Expand Up @@ -290,7 +296,8 @@ def dataframes( # noqa: D417
explicitly provided columns are appended onto the list of existing columns
(if any present).
allow_null : bool or Mapping[str, bool]
Allow nulls as possible values.
Allow nulls as possible values and allow the `Null` data type by default.
Accepts either a boolean or a mapping of column names to booleans.
allowed_dtypes : {list,set}, optional
when automatically generating data, allow only these dtypes.
excluded_dtypes : {list,set}, optional
Expand Down Expand Up @@ -404,12 +411,10 @@ def dataframes( # noqa: D417
c.name = f"col{idx}"
if c.allow_null is None:
if isinstance(allow_null, Mapping):
c.allow_null = allow_null.get(c.name, False)
c.allow_null = allow_null.get(c.name, True)
else:
c.allow_null = allow_null

# init dataframe from generated series data; series data is
# given as a python-native sequence.
with StringCache():
data = {
c.name: draw(
Expand Down Expand Up @@ -456,7 +461,7 @@ class column:
strategy : strategy, optional
supports overriding the default strategy for the given dtype.
allow_null : bool, optional
Allow nulls as possible values.
Allow nulls as possible values and allow the `Null` data type by default.
unique : bool, optional
flag indicating that all values generated for the column should be unique.
Expand Down
3 changes: 2 additions & 1 deletion py-polars/polars/testing/parametric/strategies/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,14 +349,15 @@ def data(
)
elif dtype == List:
inner = getattr(dtype, "inner", None) or Null()
strategy = lists(inner, **kwargs)
strategy = lists(inner, allow_null=allow_null, **kwargs)
elif dtype == Array:
inner = getattr(dtype, "inner", None) or Null()
width = getattr(dtype, "width", _DEFAULT_ARRAY_WIDTH_LIMIT)
strategy = lists(
inner,
min_len=width,
max_len=width,
allow_null=allow_null,
**kwargs,
)
else:
Expand Down
1 change: 1 addition & 0 deletions py-polars/polars/testing/parametric/strategies/dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
Binary,
Date,
Time,
Null,
]
# Supported data type classes with arguments
_COMPLEX_DTYPES: list[DataTypeClass] = [
Expand Down
2 changes: 1 addition & 1 deletion py-polars/tests/unit/functions/range/test_date_range.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@


def test_date_range() -> None:
    # if low/high are both date, range will also be date _iif_ the granularity is >= 1d
    # if low/high are both date, range will also be date _iff_ the granularity is >= 1d
result = pl.date_range(date(2022, 1, 1), date(2022, 3, 1), "1mo", eager=True)
assert result.to_list() == [date(2022, 1, 1), date(2022, 2, 1), date(2022, 3, 1)]

Expand Down
17 changes: 15 additions & 2 deletions py-polars/tests/unit/interchange/test_roundtrip.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,12 @@
]


@given(dataframes(allowed_dtypes=protocol_dtypes))
@given(
dataframes(
allowed_dtypes=protocol_dtypes,
allow_null=False, # Bug: https://github.com/pola-rs/polars/issues/16190
)
)
def test_to_dataframe_pyarrow_parametric(df: pl.DataFrame) -> None:
dfi = df.__dataframe__()
df_pa = pa.interchange.from_dataframe(dfi)
Expand Down Expand Up @@ -71,7 +76,12 @@ def test_to_dataframe_pyarrow_zero_copy_parametric(df: pl.DataFrame) -> None:
@pytest.mark.filterwarnings(
"ignore:.*PEP3118 format string that does not match its itemsize:RuntimeWarning"
)
@given(dataframes(allowed_dtypes=protocol_dtypes))
@given(
dataframes(
allowed_dtypes=protocol_dtypes,
allow_null=False, # Bug: https://github.com/pola-rs/polars/issues/16190
)
)
def test_to_dataframe_pandas_parametric(df: pl.DataFrame) -> None:
dfi = df.__dataframe__()
df_pd = pd.api.interchange.from_dataframe(dfi)
Expand All @@ -94,6 +104,7 @@ def test_to_dataframe_pandas_parametric(df: pl.DataFrame) -> None:
pl.Categorical,
],
chunked=False,
allow_null=False, # Bug: https://github.com/pola-rs/polars/issues/16190
)
)
def test_to_dataframe_pandas_zero_copy_parametric(df: pl.DataFrame) -> None:
Expand Down Expand Up @@ -193,6 +204,7 @@ def test_from_dataframe_pandas_zero_copy_parametric(df: pl.DataFrame) -> None:
# Empty string columns cause an error due to a bug in pandas.
# https://github.com/pandas-dev/pandas/issues/56703
min_size=1,
allow_null=False, # Bug: https://github.com/pola-rs/polars/issues/16190
)
)
def test_from_dataframe_pandas_native_parametric(df: pl.DataFrame) -> None:
Expand All @@ -217,6 +229,7 @@ def test_from_dataframe_pandas_native_parametric(df: pl.DataFrame) -> None:
# https://github.com/pandas-dev/pandas/issues/56700
min_size=1,
chunked=False,
allow_null=False, # Bug: https://github.com/pola-rs/polars/issues/16190
)
)
def test_from_dataframe_pandas_native_zero_copy_parametric(df: pl.DataFrame) -> None:
Expand Down
17 changes: 13 additions & 4 deletions py-polars/tests/unit/interop/numpy/test_to_numpy_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,11 +326,19 @@ def test_series_to_numpy_temporal() -> None:

@given(
s=series(
min_size=1, max_size=10, excluded_dtypes=[pl.Categorical, pl.List, pl.Struct]
min_size=1,
max_size=10,
excluded_dtypes=[
pl.Categorical,
pl.List,
pl.Struct,
pl.Datetime("ms"),
pl.Duration("ms"),
],
allow_null=False,
).filter(
lambda s: (
getattr(s.dtype, "time_unit", None) != "ms"
and not (s.dtype == pl.String and s.str.contains("\x00").any())
not (s.dtype == pl.String and s.str.contains("\x00").any())
and not (s.dtype == pl.Binary and s.bin.contains(b"\x00").any())
)
),
Expand All @@ -345,8 +353,9 @@ def test_series_to_numpy(s: pl.Series) -> None:
pl.Datetime("us"): "datetime64[us]",
pl.Duration("ns"): "timedelta64[ns]",
pl.Duration("us"): "timedelta64[us]",
pl.Null(): "float32",
}
np_dtype = dtype_map.get(s.dtype) # type: ignore[call-overload]
np_dtype = dtype_map.get(s.dtype)
expected = np.array(values, dtype=np_dtype)

assert_array_equal(result, expected)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1350,7 +1350,7 @@ def test_series_duration_timeunits(


@given(
s=series(min_size=1, max_size=10, dtype=pl.Datetime),
s=series(min_size=1, max_size=10, dtype=pl.Datetime, allow_null=False),
)
def test_series_datetime_timeunits(
s: pl.Series,
Expand Down
1 change: 1 addition & 0 deletions py-polars/tests/unit/series/buffers/test_from_buffer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
s=series(
allowed_dtypes=(pl.INTEGER_DTYPES | pl.FLOAT_DTYPES | {pl.Boolean}),
chunked=False,
allow_null=False,
)
)
def test_series_from_buffer(s: pl.Series) -> None:
Expand Down
32 changes: 24 additions & 8 deletions py-polars/tests/unit/testing/parametric/strategies/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
def test_series_defaults(s: pl.Series) -> None:
assert isinstance(s, pl.Series)
assert s.name == ""
assert s.null_count() == 0


@given(s=series(name="hello"))
Expand All @@ -39,7 +38,7 @@ def test_series_dtype(data: st.DataObject) -> None:
assert s.dtype == dtype


@given(s=series(dtype=pl.Enum))
@given(s=series(dtype=pl.Enum, allow_null=False))
@settings(max_examples=5)
def test_series_dtype_enum(s: pl.Series) -> None:
assert isinstance(s.dtype, pl.Enum)
Expand All @@ -58,9 +57,21 @@ def test_series_size_range(s: pl.Series) -> None:
assert 3 <= s.len() <= 8


@given(s=series(allow_null=True))
def test_series_allow_null(s: pl.Series) -> None:
assert 0 <= s.null_count() <= s.len()
@given(s=series(allow_null=False))
def test_series_allow_null_false(s: pl.Series) -> None:
assert s.null_count() == 0
assert s.dtype != pl.Null


@given(s=series(allowed_dtypes=[pl.Null], allow_null=False))
def test_series_allow_null_allowed_dtypes(s: pl.Series) -> None:
assert s.dtype == pl.Null


@given(s=series(allowed_dtypes=[pl.List(pl.Int8)], allow_null=False))
def test_series_allow_null_nested(s: pl.Series) -> None:
for v in s:
assert v.null_count() == 0


@given(df=dataframes())
Expand Down Expand Up @@ -121,6 +132,7 @@ def test_dataframes_allow_null_override(df: pl.DataFrame) -> None:
# generate lazyframes with at least one row
lazy=True,
min_size=1,
allow_null=False,
# test mix & match of bulk-assigned cols with custom cols
cols=[column(n, dtype=pl.UInt8, unique=True) for n in ["a", "b"]],
include_cols=[
Expand Down Expand Up @@ -190,7 +202,8 @@ def test_allow_infinities_deprecated(data: st.DataObject) -> None:
min_len=1,
),
),
]
],
allow_null=False,
),
)
def test_dataframes_nested_strategies(df: pl.DataFrame) -> None:
Expand Down Expand Up @@ -255,9 +268,12 @@ def test_chunking(

@given(
df=dataframes(
allowed_dtypes=[pl.Float32, pl.Float64], max_cols=4, allow_infinity=False
allowed_dtypes=[pl.Float32, pl.Float64],
max_cols=4,
allow_null=False,
allow_infinity=False,
),
s=series(dtype=pl.Float64, allow_infinity=False),
s=series(dtype=pl.Float64, allow_null=False, allow_infinity=False),
)
def test_infinities(
df: pl.DataFrame,
Expand Down

0 comments on commit f6b4f48

Please sign in to comment.