Skip to content

Commit

Permalink
fix: Reading CSV with low_memory gave no data (pola-rs#16231)
Browse files — browse the repository at this point in the history
  • Loading branch information
nameexhaustion authored May 15, 2024
1 parent 755d40d commit 253fd24
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 24 deletions.
8 changes: 1 addition & 7 deletions crates/polars-pipe/src/executors/sources/csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,6 @@ impl CsvSource {
eprintln!("STREAMING CHUNK SIZE: {chunk_size} rows")
}

let low_memory = options.low_memory;

let reader: CsvReader<File> = options
.with_n_rows(n_rows)
.with_columns(with_columns)
Expand All @@ -96,11 +94,7 @@ impl CsvSource {

// Safety: `reader` outlives `batched_reader`
let reader: &'static mut CsvReader<File> = unsafe { std::mem::transmute(reader) };
let batched_reader = if low_memory {
Either::Right(reader.batched_borrowed_read()?)
} else {
Either::Left(reader.batched_borrowed_mmap()?)
};
let batched_reader = Either::Left(reader.batched_borrowed_mmap()?);
self.batched_reader = Some(batched_reader);
Ok(())
}
Expand Down
19 changes: 5 additions & 14 deletions py-polars/src/batched_csv.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use std::path::PathBuf;
use std::sync::Mutex;

use polars::io::csv::read::{OwnedBatchedCsvReader, OwnedBatchedCsvReaderMmap};
use polars::io::csv::read::OwnedBatchedCsvReaderMmap;
use polars::io::mmap::MmapBytesReader;
use polars::io::RowIndex;
use polars::prelude::*;
Expand All @@ -12,7 +12,6 @@ use crate::{PyDataFrame, PyPolarsErr, Wrap};

enum BatchedReader {
MMap(OwnedBatchedCsvReaderMmap),
Read(OwnedBatchedCsvReader),
}

#[pyclass]
Expand Down Expand Up @@ -129,17 +128,10 @@ impl PyBatchedCsv {
)
.into_reader_with_file_handle(reader);

let reader = if low_memory {
let reader = reader
.batched_read(overwrite_dtype.map(Arc::new))
.map_err(PyPolarsErr::from)?;
BatchedReader::Read(reader)
} else {
let reader = reader
.batched_mmap(overwrite_dtype.map(Arc::new))
.map_err(PyPolarsErr::from)?;
BatchedReader::MMap(reader)
};
let reader = reader
.batched_mmap(overwrite_dtype.map(Arc::new))
.map_err(PyPolarsErr::from)?;
let reader = BatchedReader::MMap(reader);

Ok(PyBatchedCsv {
reader: Mutex::new(reader),
Expand All @@ -154,7 +146,6 @@ impl PyBatchedCsv {
.map_err(|e| PyPolarsErr::Other(e.to_string()))?;
match reader {
BatchedReader::MMap(reader) => reader.next_batches(n),
BatchedReader::Read(reader) => reader.next_batches(n),
}
.map_err(PyPolarsErr::from)
})?;
Expand Down
6 changes: 3 additions & 3 deletions py-polars/tests/unit/io/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -1444,10 +1444,10 @@ def test_batched_csv_reader(foods_file_path: Path) -> None:

# the final batch of the low-memory variant is different
reader = pl.read_csv_batched(foods_file_path, batch_size=4, low_memory=True)
batches = reader.next_batches(5)
assert len(batches) == 5 # type: ignore[arg-type]
batches = reader.next_batches(10)
assert batches is not None
assert len(batches) == 5

batches += reader.next_batches(5) # type: ignore[operator]
assert_frame_equal(pl.concat(batches), pl.read_csv(foods_file_path))

reader = pl.read_csv_batched(foods_file_path, batch_size=4, low_memory=True)
Expand Down

0 comments on commit 253fd24

Please sign in to comment.