Skip to content

Commit

Permalink
python: always use stdlib http reader and improve memmap ipc reader a…
Browse files Browse the repository at this point in the history
…rguments (pola-rs#4193)
  • Loading branch information
ritchie46 authored Jul 31, 2022
1 parent 1a57411 commit fad1c77
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 14 deletions.
7 changes: 5 additions & 2 deletions py-polars/polars/internals/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,12 +83,15 @@ def managed_file(file: Any) -> Iterator[Any]:
if isinstance(file, Path):
return managed_file(format_path(file))
if isinstance(file, str):
# make sure that this is before fsspec
# as fsspec needs requests to be installed
# to read from http
if file.startswith("http"):
return _process_http_file(file)
if _WITH_FSSPEC:
if infer_storage_options(file)["protocol"] == "file":
return managed_file(format_path(file))
return fsspec.open(file, **kwargs)
if file.startswith("http"):
return _process_http_file(file)
if isinstance(file, list) and bool(file) and all(isinstance(f, str) for f in file):
if _WITH_FSSPEC:
if all(infer_storage_options(f)["protocol"] == "file" for f in file):
Expand Down
33 changes: 21 additions & 12 deletions py-polars/polars/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -618,13 +618,18 @@ def scan_ipc(
# Map legacy arguments to current ones and remove them from kwargs.
n_rows = kwargs.pop("stop_after_n_rows", n_rows)

if (
_PYARROW_AVAILABLE
and row_count_name is None
and row_count_offset is None
and "*" not in file
):
read_ipc(file=file, n_rows=n_rows, use_pyarrow=True, cache=False, rechunk=False)
if _PYARROW_AVAILABLE:
# we choose the read path as we can memory map the file
if isinstance(file, str) and "*" not in file or isinstance(file, Path):
return read_ipc(
file=file,
n_rows=n_rows,
use_pyarrow=True,
memory_map=True,
rechunk=False,
row_count_name=row_count_name,
row_count_offset=row_count_offset,
).lazy()

return LazyFrame.scan_ipc(
file=file,
Expand Down Expand Up @@ -787,12 +792,11 @@ def read_ipc(
columns = kwargs.pop("projection", None)

if use_pyarrow:
if row_count_name is not None:
if n_rows and not memory_map:
raise ValueError(
"``row_count_name`` cannot be used with ``use_pyarrow=True``."
"``n_rows`` cannot be used with ``use_pyarrow=True` "
"and memory_map=False`."
)
if n_rows:
raise ValueError("``n_rows`` cannot be used with ``use_pyarrow=True``.")

storage_options = storage_options or {}
with _prepare_file_arg(file, **storage_options) as data:
Expand All @@ -804,7 +808,12 @@ def read_ipc(
)

tbl = pa.feather.read_table(data, memory_map=memory_map, columns=columns)
return DataFrame._from_arrow(tbl, rechunk=rechunk)
df = DataFrame._from_arrow(tbl, rechunk=rechunk)
if row_count_name is not None:
df = df.with_row_count(row_count_name, row_count_offset)
if n_rows is not None:
df = df.slice(0, n_rows)
return df

return DataFrame._read_ipc(
data,
Expand Down

0 comments on commit fad1c77

Please sign in to comment.