From 253fd24fc4318cb79beb16e61523ac0c81308acb Mon Sep 17 00:00:00 2001 From: nameexhaustion Date: Wed, 15 May 2024 19:23:56 +1000 Subject: [PATCH] fix: Reading CSV with low_memory gave no data (#16231) --- .../polars-pipe/src/executors/sources/csv.rs | 8 +------- py-polars/src/batched_csv.rs | 19 +++++-------------- py-polars/tests/unit/io/test_csv.py | 6 +++--- 3 files changed, 9 insertions(+), 24 deletions(-) diff --git a/crates/polars-pipe/src/executors/sources/csv.rs b/crates/polars-pipe/src/executors/sources/csv.rs index 7391dd390d17..cdaecf32b02b 100644 --- a/crates/polars-pipe/src/executors/sources/csv.rs +++ b/crates/polars-pipe/src/executors/sources/csv.rs @@ -81,8 +81,6 @@ impl CsvSource { eprintln!("STREAMING CHUNK SIZE: {chunk_size} rows") } - let low_memory = options.low_memory; - let reader: CsvReader = options .with_n_rows(n_rows) .with_columns(with_columns) @@ -96,11 +94,7 @@ impl CsvSource { // Safety: `reader` outlives `batched_reader` let reader: &'static mut CsvReader = unsafe { std::mem::transmute(reader) }; - let batched_reader = if low_memory { - Either::Right(reader.batched_borrowed_read()?) - } else { - Either::Left(reader.batched_borrowed_mmap()?) - }; + let batched_reader = Either::Left(reader.batched_borrowed_mmap()?); self.batched_reader = Some(batched_reader); Ok(()) } diff --git a/py-polars/src/batched_csv.rs b/py-polars/src/batched_csv.rs index c547da5dd94c..db245c63dab9 100644 --- a/py-polars/src/batched_csv.rs +++ b/py-polars/src/batched_csv.rs @@ -1,7 +1,7 @@ use std::path::PathBuf; use std::sync::Mutex; -use polars::io::csv::read::{OwnedBatchedCsvReader, OwnedBatchedCsvReaderMmap}; +use polars::io::csv::read::OwnedBatchedCsvReaderMmap; use polars::io::mmap::MmapBytesReader; use polars::io::RowIndex; use polars::prelude::*; @@ -12,7 +12,6 @@ use crate::{PyDataFrame, PyPolarsErr, Wrap}; enum BatchedReader { MMap(OwnedBatchedCsvReaderMmap), - Read(OwnedBatchedCsvReader), } #[pyclass] @@ -129,17 +128,10 @@ impl PyBatchedCsv { ) .into_reader_with_file_handle(reader); - let reader = if low_memory { - let reader = reader - .batched_read(overwrite_dtype.map(Arc::new)) - .map_err(PyPolarsErr::from)?; - BatchedReader::Read(reader) - } else { - let reader = reader - .batched_mmap(overwrite_dtype.map(Arc::new)) - .map_err(PyPolarsErr::from)?; - BatchedReader::MMap(reader) - }; + let reader = reader + .batched_mmap(overwrite_dtype.map(Arc::new)) + .map_err(PyPolarsErr::from)?; + let reader = BatchedReader::MMap(reader); Ok(PyBatchedCsv { reader: Mutex::new(reader), @@ -154,7 +146,6 @@ impl PyBatchedCsv { .map_err(|e| PyPolarsErr::Other(e.to_string()))?; match reader { BatchedReader::MMap(reader) => reader.next_batches(n), - BatchedReader::Read(reader) => reader.next_batches(n), } .map_err(PyPolarsErr::from) })?; diff --git a/py-polars/tests/unit/io/test_csv.py b/py-polars/tests/unit/io/test_csv.py index e19d31ca2e29..d92deefa4ce2 100644 --- a/py-polars/tests/unit/io/test_csv.py +++ b/py-polars/tests/unit/io/test_csv.py @@ -1444,10 +1444,10 @@ def test_batched_csv_reader(foods_file_path: Path) -> None: # the final batch of the low-memory variant is different reader = pl.read_csv_batched(foods_file_path, batch_size=4, low_memory=True) - batches = reader.next_batches(5) - assert len(batches) == 5 # type: ignore[arg-type] + batches = reader.next_batches(10) + assert batches is not None + assert len(batches) == 5 - batches += reader.next_batches(5) # type: ignore[operator] assert_frame_equal(pl.concat(batches), pl.read_csv(foods_file_path)) reader = pl.read_csv_batched(foods_file_path, batch_size=4, low_memory=True)