From 59536fc6212dcbc60bfa13ca7d5fe8ec948daff4 Mon Sep 17 00:00:00 2001 From: Ritchie Vink Date: Wed, 1 Mar 2023 14:06:44 +0100 Subject: [PATCH] perf(rust, python): use inlined strings for field and schema (#7272) --- Cargo.toml | 1 + polars/polars-core/Cargo.toml | 4 +- .../src/chunked_array/logical/struct_/mod.rs | 6 ++- .../src/chunked_array/ops/sort/mod.rs | 1 + polars/polars-core/src/datatypes/field.rs | 21 +++++----- polars/polars-core/src/datatypes/mod.rs | 4 +- .../polars-core/src/frame/asof_join/groups.rs | 15 +++----- polars/polars-core/src/frame/asof_join/mod.rs | 8 ++-- polars/polars-core/src/frame/explode.rs | 19 +++++----- polars/polars-core/src/schema.rs | 27 ++++++------- polars/polars-core/src/utils/mod.rs | 11 ++++++ polars/polars-io/src/ndjson_core/buffer.rs | 2 +- polars/polars-io/src/parquet/predicates.rs | 2 +- polars/polars-lazy/Cargo.toml | 1 + polars/polars-lazy/polars-plan/Cargo.toml | 1 + .../polars-lazy/polars-plan/src/dsl/string.rs | 13 +++++-- .../polars-plan/src/logical_plan/builder.rs | 8 ++-- .../src/logical_plan/functions/drop.rs | 4 +- .../src/logical_plan/functions/mod.rs | 8 ++-- .../src/logical_plan/functions/rename.rs | 8 ++-- .../optimizer/predicate_pushdown/rename.rs | 6 ++- .../optimizer/projection_pushdown/rename.rs | 6 ++- .../polars-plan/src/logical_plan/schema.rs | 10 +++-- polars/polars-lazy/polars-plan/src/utils.rs | 3 +- polars/polars-lazy/src/frame/mod.rs | 19 +++++----- .../src/physical_plan/executors/scan/csv.rs | 4 +- .../src/physical_plan/executors/scan/ipc.rs | 4 +- .../physical_plan/executors/scan/parquet.rs | 4 +- .../src/physical_plan/expressions/window.rs | 3 +- polars/polars-lazy/src/tests/queries.rs | 4 +- polars/polars-ops/Cargo.toml | 1 + .../src/chunked_array/list/to_struct.rs | 8 ++-- polars/polars-time/Cargo.toml | 1 + polars/polars-time/src/groupby/dynamic.rs | 7 ++-- polars/polars-utils/Cargo.toml | 1 + polars/polars-utils/src/fmt.rs | 11 ++++++ polars/polars-utils/src/lib.rs | 1 + polars/tests/it/io/csv.rs | 2 +- polars/tests/it/io/json.rs | 8 ++-- polars/tests/it/schema.rs | 2 +- py-polars/Cargo.lock | 7 ++++ py-polars/Cargo.toml | 1 + py-polars/src/conversion.rs | 13 ++++++- py-polars/src/dataframe.rs | 18 ++++----- py-polars/src/lazy/dataframe.rs | 38 ++++++++++--------- py-polars/src/lazy/dsl.rs | 4 +- 46 files changed, 209 insertions(+), 141 deletions(-) create mode 100644 polars/polars-utils/src/fmt.rs diff --git a/Cargo.toml b/Cargo.toml index eac429e728d9..974ea3f857e0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,6 +34,7 @@ hashbrown = { version = "0.13.1", features = ["rayon", "ahash"] } bitflags = "1.3" once_cell = "1" memchr = "2" +smartstring = { version = "1" } [workspace.dependencies.arrow] package = "arrow2" diff --git a/polars/polars-core/Cargo.toml b/polars/polars-core/Cargo.toml index 0dc0a0eb79e8..bff2bdd24c3d 100644 --- a/polars/polars-core/Cargo.toml +++ b/polars/polars-core/Cargo.toml @@ -100,7 +100,7 @@ parquet = ["arrow/io_parquet"] bigidx = ["polars-arrow/bigidx"] python = [] -serde-lazy = ["serde", "polars-arrow/serde", "indexmap/serde"] +serde-lazy = ["serde", "polars-arrow/serde", "indexmap/serde", "smartstring/serde"] docs-selection = [ "ndarray", @@ -174,7 +174,7 @@ regex = { version = "1.6", optional = true } # activate if you want serde support for Series and DataFrames serde = { version = "1", features = ["derive"], optional = true } serde_json = { version = "1", optional = true } -smartstring = { version = "1" } +smartstring.workspace = true thiserror.workspace = true url = { version = "2.3.1", optional = true } xxhash-rust.workspace = true diff --git a/polars/polars-core/src/chunked_array/logical/struct_/mod.rs b/polars/polars-core/src/chunked_array/logical/struct_/mod.rs index 1fa7c6363a86..0b81252b4b75 100644 --- a/polars/polars-core/src/chunked_array/logical/struct_/mod.rs +++ b/polars/polars-core/src/chunked_array/logical/struct_/mod.rs @@ -2,6 +2,8 @@ mod from; use std::collections::BTreeMap; +use smartstring::alias::String as SmartString; + use super::*; use crate::datatypes::*; use crate::utils::index_to_chunked_index2; @@ -163,7 +165,7 @@ impl StructChunked { &self.field } - pub fn name(&self) -> &String { + pub fn name(&self) -> &SmartString { self.field.name() } @@ -176,7 +178,7 @@ impl StructChunked { } pub fn rename(&mut self, name: &str) { - self.field.set_name(name.to_string()) + self.field.set_name(name.into()) } pub(crate) fn try_apply_fields(&self, func: F) -> PolarsResult diff --git a/polars/polars-core/src/chunked_array/ops/sort/mod.rs b/polars/polars-core/src/chunked_array/ops/sort/mod.rs index fbeb7f27195d..dc9fc2c5223e 100644 --- a/polars/polars-core/src/chunked_array/ops/sort/mod.rs +++ b/polars/polars-core/src/chunked_array/ops/sort/mod.rs @@ -8,6 +8,7 @@ use std::cmp::Ordering; use std::hint::unreachable_unchecked; use std::iter::FromIterator; +#[cfg(feature = "sort_multiple")] pub(crate) use arg_sort_multiple::argsort_multiple_row_fmt; use arrow::bitmap::MutableBitmap; use arrow::buffer::Buffer; diff --git a/polars/polars-core/src/datatypes/field.rs b/polars/polars-core/src/datatypes/field.rs index 44d9d5f3d635..7eab98feae8b 100644 --- a/polars/polars-core/src/datatypes/field.rs +++ b/polars/polars-core/src/datatypes/field.rs @@ -1,13 +1,12 @@ +use smartstring::alias::String as SmartString; + use super::*; /// Characterizes the name and the [`DataType`] of a column. #[derive(Clone, Debug, PartialEq, Eq)] -#[cfg_attr( - any(feature = "serde", feature = "serde-lazy"), - derive(Serialize, Deserialize) -)] +#[cfg_attr(feature = "serde-lazy", derive(Serialize, Deserialize))] pub struct Field { - pub name: String, + pub name: SmartString, pub dtype: DataType, } @@ -25,12 +24,12 @@ impl Field { #[inline] pub fn new(name: &str, dtype: DataType) -> Self { Field { - name: name.to_string(), + name: name.into(), dtype, } } - pub fn from_owned(name: String, dtype: DataType) -> Self { + pub fn from_owned(name: SmartString, dtype: DataType) -> Self { Field { name, dtype } } @@ -45,7 +44,7 @@ impl Field { /// assert_eq!(f.name(), "Year"); /// ``` #[inline] - pub fn name(&self) -> &String { + pub fn name(&self) -> &SmartString { &self.name } @@ -86,11 +85,11 @@ impl Field { /// ```rust /// # use polars_core::prelude::*; /// let mut f = Field::new("Atomic number", DataType::UInt32); - /// f.set_name("Proton".to_owned()); + /// f.set_name("Proton".into()); /// /// assert_eq!(f, Field::new("Proton", DataType::UInt32)); /// ``` - pub fn set_name(&mut self, name: String) { + pub fn set_name(&mut self, name: SmartString) { self.name = name; } @@ -106,7 +105,7 @@ impl Field { /// assert_eq!(f.to_arrow(), af); /// ``` pub fn to_arrow(&self) -> ArrowField { - ArrowField::new(&self.name, self.dtype.to_arrow(), true) + ArrowField::new(self.name.as_str(), self.dtype.to_arrow(), true) } } diff --git a/polars/polars-core/src/datatypes/mod.rs b/polars/polars-core/src/datatypes/mod.rs index 72a8b7939072..2b3012f076a1 100644 --- a/polars/polars-core/src/datatypes/mod.rs +++ b/polars/polars-core/src/datatypes/mod.rs @@ -34,9 +34,9 @@ use num_traits::{Bounded, FromPrimitive, Num, NumCast, Zero}; use polars_arrow::data_types::IsFloat; #[cfg(feature = "serde")] use serde::de::{EnumAccess, Error, Unexpected, VariantAccess, Visitor}; -#[cfg(feature = "serde")] +#[cfg(any(feature = "serde", feature = "serde-lazy"))] use serde::{Deserialize, Serialize}; -#[cfg(feature = "serde")] +#[cfg(any(feature = "serde", feature = "serde-lazy"))] use serde::{Deserializer, Serializer}; pub use time_unit::*; diff --git a/polars/polars-core/src/frame/asof_join/groups.rs b/polars/polars-core/src/frame/asof_join/groups.rs index 77f6bd2d16ff..88f74474d641 100644 --- a/polars/polars-core/src/frame/asof_join/groups.rs +++ b/polars/polars-core/src/frame/asof_join/groups.rs @@ -6,6 +6,7 @@ use ahash::RandomState; use arrow::types::NativeType; use num_traits::Zero; use rayon::prelude::*; +use smartstring::alias::String as SmartString; use super::*; use crate::frame::groupby::hashing::HASHMAP_INIT_SIZE; @@ -634,8 +635,8 @@ impl DataFrame { other: &DataFrame, left_on: &str, right_on: &str, - left_by: Vec, - right_by: Vec, + left_by: Vec, + right_by: Vec, strategy: AsofStrategy, tolerance: Option>, suffix: Option<&str>, @@ -727,14 +728,8 @@ impl DataFrame { I: IntoIterator, S: AsRef, { - let left_by = left_by - .into_iter() - .map(|s| s.as_ref().to_string()) - .collect(); - let right_by = right_by - .into_iter() - .map(|s| s.as_ref().to_string()) - .collect(); + let left_by = left_by.into_iter().map(|s| s.as_ref().into()).collect(); + let right_by = right_by.into_iter().map(|s| s.as_ref().into()).collect(); self._join_asof_by( other, left_on, right_on, left_by, right_by, strategy, tolerance, None, None, ) diff --git a/polars/polars-core/src/frame/asof_join/mod.rs b/polars/polars-core/src/frame/asof_join/mod.rs index 030bc038690a..71a93b36bf1d 100644 --- a/polars/polars-core/src/frame/asof_join/mod.rs +++ b/polars/polars-core/src/frame/asof_join/mod.rs @@ -1,12 +1,12 @@ mod asof; mod groups; - use std::borrow::Cow; use asof::*; use num_traits::Bounded; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +use smartstring::alias::String as SmartString; use crate::prelude::*; use crate::utils::slice_slice; @@ -22,9 +22,9 @@ pub struct AsOfOptions { /// - "2h15m" /// - "1d6h" /// etc - pub tolerance_str: Option, - pub left_by: Option>, - pub right_by: Option>, + pub tolerance_str: Option, + pub left_by: Option>, + pub right_by: Option>, } fn check_asof_columns(a: &Series, b: &Series) -> PolarsResult<()> { diff --git a/polars/polars-core/src/frame/explode.rs b/polars/polars-core/src/frame/explode.rs index 1972a7cf581e..b64b3718bb6e 100644 --- a/polars/polars-core/src/frame/explode.rs +++ b/polars/polars-core/src/frame/explode.rs @@ -1,7 +1,8 @@ use arrow::offset::OffsetsBuffer; use polars_arrow::kernels::concatenate::concatenate_owned_unchecked; -#[cfg(feature = "serde")] +#[cfg(feature = "serde-lazy")] use serde::{Deserialize, Serialize}; +use smartstring::alias::String as SmartString; use crate::chunked_array::ops::explode::offsets_to_indexes; use crate::prelude::*; @@ -20,12 +21,12 @@ fn get_exploded(series: &Series) -> PolarsResult<(Series, OffsetsBuffer)> { /// Arguments for `[DataFrame::melt]` function #[derive(Clone, Default, Debug)] -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "serde-lazy", derive(Serialize, Deserialize))] pub struct MeltArgs { - pub id_vars: Vec, - pub value_vars: Vec, - pub variable_name: Option, - pub value_name: Option, + pub id_vars: Vec, + pub value_vars: Vec, + pub variable_name: Option, + pub value_name: Option, } impl DataFrame { @@ -209,8 +210,8 @@ impl DataFrame { /// ``` pub fn melt(&self, id_vars: I, value_vars: J) -> PolarsResult where - I: IntoVec, - J: IntoVec, + I: IntoVec, + J: IntoVec, { let id_vars = id_vars.into_vec(); let value_vars = value_vars.into_vec(); @@ -242,7 +243,7 @@ impl DataFrame { if id_vars_set.contains(s.name()) { None } else { - Some(s.name().to_string()) + Some(s.name().into()) } }) .collect(); diff --git a/polars/polars-core/src/schema.rs b/polars/polars-core/src/schema.rs index 33b25360adcf..eb3ecdb5d82b 100644 --- a/polars/polars-core/src/schema.rs +++ b/polars/polars-core/src/schema.rs @@ -3,13 +3,14 @@ use std::fmt::{Debug, Formatter}; use indexmap::IndexMap; #[cfg(feature = "serde-lazy")] use serde::{Deserialize, Serialize}; +use smartstring::alias::String as SmartString; use crate::prelude::*; #[derive(Eq, Clone, Default)] #[cfg_attr(feature = "serde-lazy", derive(Serialize, Deserialize))] pub struct Schema { - inner: PlIndexMap, + inner: PlIndexMap, } // IndexMap does not care about order. @@ -90,7 +91,7 @@ impl Schema { self.inner.is_empty() } - pub fn rename(&mut self, old: &str, new: String) -> Option<()> { + pub fn rename(&mut self, old: &str, new: SmartString) -> Option<()> { // we first append the new name // and then remove the old name // this works because the removed slot is swapped with the last value in the indexmap @@ -100,7 +101,7 @@ impl Schema { Some(()) } - pub fn insert_index(&self, index: usize, name: String, dtype: DataType) -> Option { + pub fn insert_index(&self, index: usize, name: SmartString, dtype: DataType) -> Option { // 0 and self.len() 0 is allowed if index > self.len() { return None; @@ -125,7 +126,7 @@ impl Schema { .ok_or_else(|| PolarsError::SchemaFieldNotFound(name.to_string().into())) } - pub fn try_get_full(&self, name: &str) -> PolarsResult<(usize, &String, &DataType)> { + pub fn try_get_full(&self, name: &str) -> PolarsResult<(usize, &SmartString, &DataType)> { self.inner .get_full(name) .ok_or_else(|| PolarsError::SchemaFieldNotFound(name.to_string().into())) @@ -135,7 +136,7 @@ impl Schema { self.inner.remove(name) } - pub fn get_full(&self, name: &str) -> Option<(usize, &String, &DataType)> { + pub fn get_full(&self, name: &str) -> Option<(usize, &SmartString, &DataType)> { self.inner.get_full(name) } @@ -152,7 +153,7 @@ impl Schema { .map(|dtype| Field::new(name, dtype.clone())) } - pub fn get_index(&self, index: usize) -> Option<(&String, &DataType)> { + pub fn get_index(&self, index: usize) -> Option<(&SmartString, &DataType)> { self.inner.get_index(index) } @@ -160,7 +161,7 @@ impl Schema { self.get(name).is_some() } - pub fn get_index_mut(&mut self, index: usize) -> Option<(&mut String, &mut DataType)> { + pub fn get_index_mut(&mut self, index: usize) -> Option<(&mut SmartString, &mut DataType)> { self.inner.get_index_mut(index) } @@ -184,7 +185,7 @@ impl Schema { /// inserted, last in order, and `None` is returned. /// /// Computes in **O(1)** time (amortized average). - pub fn with_column(&mut self, name: String, dtype: DataType) -> Option { + pub fn with_column(&mut self, name: SmartString, dtype: DataType) -> Option { self.inner.insert(name, dtype) } @@ -196,7 +197,7 @@ impl Schema { let fields: Vec<_> = self .inner .iter() - .map(|(name, dtype)| ArrowField::new(name, dtype.to_arrow(), true)) + .map(|(name, dtype)| ArrowField::new(name.as_str(), dtype.to_arrow(), true)) .collect(); ArrowSchema::from(fields) } @@ -211,10 +212,10 @@ impl Schema { self.inner.iter().map(|(_name, dtype)| dtype) } - pub fn iter_names(&self) -> impl Iterator + '_ + ExactSizeIterator { + pub fn iter_names(&self) -> impl Iterator + '_ + ExactSizeIterator { self.inner.iter().map(|(name, _dtype)| name) } - pub fn iter(&self) -> impl Iterator + '_ { + pub fn iter(&self) -> impl Iterator + '_ { self.inner.iter() } } @@ -222,8 +223,8 @@ impl Schema { pub type SchemaRef = Arc; impl IntoIterator for Schema { - type Item = (String, DataType); - type IntoIter = as IntoIterator>::IntoIter; + type Item = (SmartString, DataType); + type IntoIter = as IntoIterator>::IntoIter; fn into_iter(self) -> Self::IntoIter { self.inner.into_iter() diff --git a/polars/polars-core/src/utils/mod.rs b/polars/polars-core/src/utils/mod.rs index f31a1a2d0a6e..0fbe876eaeac 100644 --- a/polars/polars-core/src/utils/mod.rs +++ b/polars/polars-core/src/utils/mod.rs @@ -9,6 +9,7 @@ use num_traits::{One, Zero}; pub use polars_arrow::utils::{TrustMyLength, *}; use rayon::prelude::*; pub use series::*; +use smartstring::alias::String as SmartString; pub use supertype::*; pub use {arrow, rayon}; @@ -824,6 +825,16 @@ where } } +impl IntoVec for I +where + I: IntoIterator, + S: AsRef, +{ + fn into_vec(self) -> Vec { + self.into_iter().map(|s| s.as_ref().into()).collect() + } +} + /// This logic is same as the impl on ChunkedArray /// The difference is that there is less indirection because the caller should preallocate /// `chunk_lens` once. On the `ChunkedArray` we indirect through an `ArrayRef` which is an indirection diff --git a/polars/polars-io/src/ndjson_core/buffer.rs b/polars/polars-io/src/ndjson_core/buffer.rs index b7a4ee0b3317..2691396c3bec 100644 --- a/polars/polars-io/src/ndjson_core/buffer.rs +++ b/polars/polars-io/src/ndjson_core/buffer.rs @@ -128,7 +128,7 @@ pub(crate) fn init_buffers( .iter() .map(|(name, dtype)| { let av_buf = (dtype, capacity).into(); - let key = KnownKey::from(name); + let key = KnownKey::from(name.as_str()); Ok((BufferKey(key), Buffer(name, av_buf))) }) .collect() diff --git a/polars/polars-io/src/parquet/predicates.rs b/polars/polars-io/src/parquet/predicates.rs index 7f8f0ddfef46..83a3628530e7 100644 --- a/polars/polars-io/src/parquet/predicates.rs +++ b/polars/polars-io/src/parquet/predicates.rs @@ -127,7 +127,7 @@ pub(crate) fn collect_statistics( // we select a single row group and collect only those stats Some(rg) => deserialize(fld, &md[rg..rg + 1])?, }; - schema.with_column(fld.name.to_string(), (&fld.data_type).into()); + schema.with_column((&fld.name).into(), (&fld.data_type).into()); stats.push(ColumnStats(st, Field::from(fld))); } diff --git a/polars/polars-lazy/Cargo.toml b/polars/polars-lazy/Cargo.toml index ff656e3fda58..de86df4482cb 100644 --- a/polars/polars-lazy/Cargo.toml +++ b/polars/polars-lazy/Cargo.toml @@ -26,6 +26,7 @@ polars-time = { version = "0.27.2", path = "../polars-time", optional = true } polars-utils = { version = "0.27.2", path = "../polars-utils" } pyo3 = { version = "0.18", optional = true } rayon.workspace = true +smartstring.workspace = true [features] nightly = ["polars-core/nightly", "polars-pipe/nightly"] diff --git a/polars/polars-lazy/polars-plan/Cargo.toml b/polars/polars-lazy/polars-plan/Cargo.toml index 34f6c6f5bbbb..059302c26449 100644 --- a/polars/polars-lazy/polars-plan/Cargo.toml +++ b/polars/polars-lazy/polars-plan/Cargo.toml @@ -22,6 +22,7 @@ pyo3 = { version = "0.18", optional = true } rayon.workspace = true regex = { version = "1.6", optional = true } serde = { version = "1", features = ["derive", "rc"], optional = true } +smartstring.workspace = true [features] # debuging utility diff --git a/polars/polars-lazy/polars-plan/src/dsl/string.rs b/polars/polars-lazy/polars-plan/src/dsl/string.rs index 90733ee4ccd1..2b540808fae9 100644 --- a/polars/polars-lazy/polars-plan/src/dsl/string.rs +++ b/polars/polars-lazy/polars-plan/src/dsl/string.rs @@ -1,6 +1,7 @@ use polars_arrow::array::ValueSize; #[cfg(feature = "dtype-struct")] use polars_arrow::export::arrow::array::{MutableArray, MutableUtf8Array}; +use polars_utils::format_smartstring; use super::function_expr::StringFunction; use super::*; @@ -217,7 +218,9 @@ impl StringNameSpace { function, GetOutput::from_type(DataType::Struct( (0..n + 1) - .map(|i| Field::from_owned(format!("field_{i}"), DataType::Utf8)) + .map(|i| { + Field::from_owned(format_smartstring!("field_{i}"), DataType::Utf8) + }) .collect(), )), ) @@ -269,7 +272,9 @@ impl StringNameSpace { function, GetOutput::from_type(DataType::Struct( (0..n + 1) - .map(|i| Field::from_owned(format!("field_{i}"), DataType::Utf8)) + .map(|i| { + Field::from_owned(format_smartstring!("field_{i}"), DataType::Utf8) + }) .collect(), )), ) @@ -321,7 +326,9 @@ impl StringNameSpace { function, GetOutput::from_type(DataType::Struct( (0..n) - .map(|i| Field::from_owned(format!("field_{i}"), DataType::Utf8)) + .map(|i| { + Field::from_owned(format_smartstring!("field_{i}"), DataType::Utf8) + }) .collect(), )), ) diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/builder.rs b/polars/polars-lazy/polars-plan/src/logical_plan/builder.rs index 55d3f8b83149..7c651ac0c5ce 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/builder.rs +++ b/polars/polars-lazy/polars-plan/src/logical_plan/builder.rs @@ -374,7 +374,7 @@ impl LogicalPlanBuilder { let field = e .to_field_amortized(&schema, Context::Default, &mut arena) .unwrap(); - new_schema.with_column(field.name().to_string(), field.data_type().clone()); + new_schema.with_column(field.name().clone(), field.data_type().clone()); arena.clear(); } @@ -610,7 +610,7 @@ impl LogicalPlanBuilder { if let Expr::Column(name) = e { if let Some(DataType::List(inner)) = schema.get(name) { let inner = *inner.clone(); - schema.with_column(name.to_string(), inner); + schema.with_column(name.as_ref().into(), inner); } (**name).to_owned() @@ -739,12 +739,12 @@ pub(crate) fn det_melt_schema(args: &MeltArgs, input_schema: &Schema) -> SchemaR .variable_name .as_ref() .cloned() - .unwrap_or_else(|| "variable".to_string()); + .unwrap_or_else(|| "variable".into()); let value_name = args .value_name .as_ref() .cloned() - .unwrap_or_else(|| "value".to_string()); + .unwrap_or_else(|| "value".into()); new_schema.with_column(variable_name, DataType::Utf8); diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/functions/drop.rs b/polars/polars-lazy/polars-plan/src/logical_plan/functions/drop.rs index d1ae0c74d3f5..66803611355d 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/functions/drop.rs +++ b/polars/polars-lazy/polars-plan/src/logical_plan/functions/drop.rs @@ -1,6 +1,6 @@ use super::*; -pub(super) fn drop_impl(mut df: DataFrame, names: &[String]) -> PolarsResult { +pub(super) fn drop_impl(mut df: DataFrame, names: &[SmartString]) -> PolarsResult { for name in names { // ignore names that are not in there // they might already be removed by projection pushdown @@ -14,7 +14,7 @@ pub(super) fn drop_impl(mut df: DataFrame, names: &[String]) -> PolarsResult( input_schema: &'a SchemaRef, - names: &[String], + names: &[SmartString], ) -> PolarsResult> { let to_drop = PlHashSet::from_iter(names); diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/functions/mod.rs b/polars/polars-lazy/polars-plan/src/logical_plan/functions/mod.rs index 64b73f75447b..af68250067c2 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/functions/mod.rs +++ b/polars/polars-lazy/polars-plan/src/logical_plan/functions/mod.rs @@ -2,7 +2,6 @@ mod drop; #[cfg(feature = "merge_sorted")] mod merge_sorted; mod rename; - use std::borrow::Cow; use std::fmt::{Debug, Display, Formatter}; use std::sync::Arc; @@ -12,6 +11,7 @@ use polars_core::prelude::*; use polars_core::IUseStringCache; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +use smartstring::alias::String as SmartString; #[cfg(feature = "merge_sorted")] use crate::logical_plan::functions::merge_sorted::merge_sorted; @@ -59,13 +59,13 @@ pub enum FunctionNode { column: Arc, }, Rename { - existing: Arc>, - new: Arc>, + existing: Arc>, + new: Arc>, // A column name gets swapped with an existing column swapping: bool, }, Drop { - names: Arc>, + names: Arc>, }, } diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/functions/rename.rs b/polars/polars-lazy/polars-plan/src/logical_plan/functions/rename.rs index 9c6ba2e1ec15..0a10d78ec352 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/functions/rename.rs +++ b/polars/polars-lazy/polars-plan/src/logical_plan/functions/rename.rs @@ -2,8 +2,8 @@ use super::*; pub(super) fn rename_impl( mut df: DataFrame, - existing: &[String], - new: &[String], + existing: &[SmartString], + new: &[SmartString], ) -> PolarsResult { let positions = existing .iter() @@ -24,8 +24,8 @@ pub(super) fn rename_impl( pub(super) fn rename_schema<'a>( input_schema: &'a SchemaRef, - existing: &[String], - new: &[String], + existing: &[SmartString], + new: &[SmartString], ) -> PolarsResult> { let mut new_schema = (**input_schema).clone(); for (old, new) in existing.iter().zip(new.iter()) { diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/predicate_pushdown/rename.rs b/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/predicate_pushdown/rename.rs index afc6728d85cf..4c6588a88533 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/predicate_pushdown/rename.rs +++ b/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/predicate_pushdown/rename.rs @@ -1,3 +1,5 @@ +use smartstring::alias::String as SmartString; + use super::*; use crate::prelude::optimizer::predicate_pushdown::keys::{key_has_name, predicate_to_key}; @@ -21,8 +23,8 @@ fn remove_any_key_referencing_renamed( pub(super) fn process_rename( acc_predicates: &mut PlHashMap, Node>, expr_arena: &mut Arena, - existing: &[String], - new: &[String], + existing: &[SmartString], + new: &[SmartString], ) -> PolarsResult> { let mut local_predicates = vec![]; for (existing, new) in existing.iter().zip(new.iter()) { diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/rename.rs b/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/rename.rs index f934ea009dcb..7caa3aff226f 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/rename.rs +++ b/polars/polars-lazy/polars-plan/src/logical_plan/optimizer/projection_pushdown/rename.rs @@ -1,5 +1,7 @@ use std::collections::BTreeSet; +use smartstring::alias::String as SmartString; + use super::*; fn iter_and_update_nodes( @@ -25,8 +27,8 @@ pub(super) fn process_rename( acc_projections: &mut [Node], projected_names: &mut PlHashSet>, expr_arena: &mut Arena, - existing: &[String], - new: &[String], + existing: &[SmartString], + new: &[SmartString], swapping: bool, ) -> PolarsResult<()> { let mut processed = BTreeSet::new(); diff --git a/polars/polars-lazy/polars-plan/src/logical_plan/schema.rs b/polars/polars-lazy/polars-plan/src/logical_plan/schema.rs index d1b83b7e8c9a..1455a8a70a0e 100644 --- a/polars/polars-lazy/polars-plan/src/logical_plan/schema.rs +++ b/polars/polars-lazy/polars-plan/src/logical_plan/schema.rs @@ -1,4 +1,5 @@ use polars_core::prelude::*; +use polars_utils::format_smartstring; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; @@ -197,7 +198,7 @@ pub(crate) fn det_join_schema( for (name, dtype) in schema_left.iter() { names.insert(name.as_str()); - new_schema.with_column(name.to_string(), dtype.clone()); + new_schema.with_column(name.clone(), dtype.clone()); } // make sure that expression are assigned to the schema @@ -224,7 +225,8 @@ pub(crate) fn det_join_schema( if schema_left.contains(&field_right.name) { use polars_core::frame::hash_join::_join_suffix_name; new_schema.with_column( - _join_suffix_name(&field_right.name, options.suffix.as_ref()), + _join_suffix_name(&field_right.name, options.suffix.as_ref()) + .into(), field_right.dtype, ); } else { @@ -257,10 +259,10 @@ pub(crate) fn det_join_schema( } } - let new_name = format!("{}{}", name, options.suffix.as_ref()); + let new_name = format_smartstring!("{}{}", name, options.suffix.as_ref()); new_schema.with_column(new_name, dtype.clone()); } else { - new_schema.with_column(name.to_string(), dtype.clone()); + new_schema.with_column(name.clone(), dtype.clone()); } } } diff --git a/polars/polars-lazy/polars-plan/src/utils.rs b/polars/polars-lazy/polars-plan/src/utils.rs index 34fab32eb9f7..54dee4557e7e 100644 --- a/polars/polars-lazy/polars-plan/src/utils.rs +++ b/polars/polars-lazy/polars-plan/src/utils.rs @@ -3,6 +3,7 @@ use std::iter::FlatMap; use std::sync::Arc; use polars_core::prelude::*; +use smartstring::alias::String as SmartString; use crate::logical_plan::iterator::ArenaExprIter; use crate::logical_plan::Context; @@ -10,7 +11,7 @@ use crate::prelude::names::COUNT; use crate::prelude::*; /// Utility to write comma delimited -pub fn column_delimited(mut s: String, items: &[String]) -> String { +pub fn column_delimited(mut s: String, items: &[SmartString]) -> String { s.push('('); for c in items { s.push_str(c); diff --git a/polars/polars-lazy/src/frame/mod.rs b/polars/polars-lazy/src/frame/mod.rs index 09cd91f5568a..bd24af1d24a0 100644 --- a/polars/polars-lazy/src/frame/mod.rs +++ b/polars/polars-lazy/src/frame/mod.rs @@ -41,6 +41,7 @@ use polars_plan::global::FETCH_ROWS; use polars_plan::logical_plan::collect_fingerprints; use polars_plan::logical_plan::optimize; use polars_plan::utils::expr_to_leaf_column_names; +use smartstring::alias::String as SmartString; use crate::physical_plan::executors::Executor; use crate::physical_plan::planner::create_physical_plan; @@ -270,7 +271,7 @@ impl LazyFrame { /// Check the if the `names` are available in the `schema`, if not /// return a `LogicalPlan` that raises an `Error`. - fn check_names(&self, names: &[String], schema: Option<&SchemaRef>) -> Option { + fn check_names(&self, names: &[SmartString], schema: Option<&SchemaRef>) -> Option { let schema = schema .map(Cow::Borrowed) .unwrap_or_else(|| Cow::Owned(self.schema().unwrap())); @@ -306,16 +307,16 @@ impl LazyFrame { { let iter = existing.into_iter(); let cap = iter.size_hint().0; - let mut existing_vec = Vec::with_capacity(cap); - let mut new_vec = Vec::with_capacity(cap); + let mut existing_vec: Vec = Vec::with_capacity(cap); + let mut new_vec: Vec = Vec::with_capacity(cap); for (existing, new) in iter.zip(new.into_iter()) { let existing = existing.as_ref(); let new = new.as_ref(); if new != existing { - existing_vec.push(existing.to_string()); - new_vec.push(new.to_string()); + existing_vec.push(existing.into()); + new_vec.push(new.into()); } } @@ -342,15 +343,15 @@ impl LazyFrame { I: IntoIterator, T: AsRef, { - let columns: Vec = columns + let columns: Vec = columns .into_iter() - .map(|name| name.as_ref().to_string()) + .map(|name| name.as_ref().into()) .collect(); self.drop_columns_impl(columns) } #[allow(clippy::ptr_arg)] - fn drop_columns_impl(self, columns: Vec) -> Self { + fn drop_columns_impl(self, columns: Vec) -> Self { if let Some(lp) = self.check_names(&columns, None) { lp } else { @@ -1132,7 +1133,7 @@ impl LazyFrame { } } - let name2 = name.to_string(); + let name2: SmartString = name.into(); let udf_schema = move |s: &Schema| { let new = s.insert_index(0, name2.clone(), IDX_DTYPE).unwrap(); Ok(Arc::new(new)) diff --git a/polars/polars-lazy/src/physical_plan/executors/scan/csv.rs b/polars/polars-lazy/src/physical_plan/executors/scan/csv.rs index 37f14e3f74a0..2080f2b7e977 100644 --- a/polars/polars-lazy/src/physical_plan/executors/scan/csv.rs +++ b/polars/polars-lazy/src/physical_plan/executors/scan/csv.rs @@ -61,9 +61,9 @@ impl Executor for CsvExec { }; let profile_name = if state.has_node_timer() { - let mut ids = vec![self.path.to_string_lossy().to_string()]; + let mut ids = vec![self.path.to_string_lossy().into()]; if self.predicate.is_some() { - ids.push("predicate".to_string()) + ids.push("predicate".into()) } let name = column_delimited("csv".to_string(), &ids); Cow::Owned(name) diff --git a/polars/polars-lazy/src/physical_plan/executors/scan/ipc.rs b/polars/polars-lazy/src/physical_plan/executors/scan/ipc.rs index ad7bc9b3b196..08437546c667 100644 --- a/polars/polars-lazy/src/physical_plan/executors/scan/ipc.rs +++ b/polars/polars-lazy/src/physical_plan/executors/scan/ipc.rs @@ -38,9 +38,9 @@ impl Executor for IpcExec { }; let profile_name = if state.has_node_timer() { - let mut ids = vec![self.path.to_string_lossy().to_string()]; + let mut ids = vec![self.path.to_string_lossy().into()]; if self.predicate.is_some() { - ids.push("predicate".to_string()) + ids.push("predicate".into()) } let name = column_delimited("ipc".to_string(), &ids); Cow::Owned(name) diff --git a/polars/polars-lazy/src/physical_plan/executors/scan/parquet.rs b/polars/polars-lazy/src/physical_plan/executors/scan/parquet.rs index ab74c9d6d339..784f510fbc44 100644 --- a/polars/polars-lazy/src/physical_plan/executors/scan/parquet.rs +++ b/polars/polars-lazy/src/physical_plan/executors/scan/parquet.rs @@ -60,9 +60,9 @@ impl Executor for ParquetExec { }; let profile_name = if state.has_node_timer() { - let mut ids = vec![self.path.to_string_lossy().to_string()]; + let mut ids = vec![self.path.to_string_lossy().into()]; if self.predicate.is_some() { - ids.push("predicate".to_string()) + ids.push("predicate".into()) } let name = column_delimited("parquet".to_string(), &ids); Cow::Owned(name) diff --git a/polars/polars-lazy/src/physical_plan/expressions/window.rs b/polars/polars-lazy/src/physical_plan/expressions/window.rs index 319d889df592..17fbc3ddfb3a 100644 --- a/polars/polars-lazy/src/physical_plan/expressions/window.rs +++ b/polars/polars-lazy/src/physical_plan/expressions/window.rs @@ -12,6 +12,7 @@ use polars_core::series::IsSorted; use polars_core::utils::_split_offsets; use polars_core::utils::arrow::bitmap::MutableBitmap; use polars_core::{downcast_as_macro_arg_physical, POOL}; +use polars_utils::format_smartstring; use polars_utils::sort::perfect_sort; use polars_utils::sync::SyncPtr; use rayon::prelude::*; @@ -180,7 +181,7 @@ impl WindowExpr { let first = group.first(); let group = groupby_columns .iter() - .map(|s| format!("{}", s.get(first as usize).unwrap())) + .map(|s| format_smartstring!("{}", s.get(first as usize).unwrap())) .collect::>(); let err_msg = format!( "{}\n> Group: ", diff --git a/polars/polars-lazy/src/tests/queries.rs b/polars/polars-lazy/src/tests/queries.rs index c425fe7db76b..66740108cdb6 100644 --- a/polars/polars-lazy/src/tests/queries.rs +++ b/polars/polars-lazy/src/tests/queries.rs @@ -50,8 +50,8 @@ fn test_lazy_melt() { let df = get_df(); let args = MeltArgs { - id_vars: vec!["petal.width".to_string(), "petal.length".to_string()], - value_vars: vec!["sepal.length".to_string(), "sepal.width".to_string()], + id_vars: vec!["petal.width".into(), "petal.length".into()], + value_vars: vec!["sepal.length".into(), "sepal.width".into()], variable_name: None, value_name: None, }; diff --git a/polars/polars-ops/Cargo.toml b/polars/polars-ops/Cargo.toml index 2fc46c34df5a..89de01aceb60 100644 --- a/polars/polars-ops/Cargo.toml +++ b/polars/polars-ops/Cargo.toml @@ -20,6 +20,7 @@ polars-core = { version = "0.27.2", path = "../polars-core", features = ["privat polars-utils = { version = "0.27.2", path = "../polars-utils", default-features = false } serde = { version = "1", features = ["derive"], optional = true } serde_json = { version = "1", optional = true } +smartstring.workspace = true [features] nightly = ["polars-utils/nightly"] diff --git a/polars/polars-ops/src/chunked_array/list/to_struct.rs b/polars/polars-ops/src/chunked_array/list/to_struct.rs index da506edd17f8..7f7a12a03a95 100644 --- a/polars/polars-ops/src/chunked_array/list/to_struct.rs +++ b/polars/polars-ops/src/chunked_array/list/to_struct.rs @@ -1,4 +1,6 @@ use polars_core::export::rayon::prelude::*; +use polars_utils::format_smartstring; +use smartstring::alias::String as SmartString; use super::*; @@ -45,10 +47,10 @@ fn det_n_fields(ca: &ListChunked, n_fields: ListToStructWidthStrategy) -> usize } } -pub type NameGenerator = Arc String + Send + Sync>; +pub type NameGenerator = Arc SmartString + Send + Sync>; -pub fn _default_struct_name_gen(idx: usize) -> String { - format!("field_{idx}") +pub fn _default_struct_name_gen(idx: usize) -> SmartString { + format_smartstring!("field_{idx}") } pub trait ToStruct: AsList { diff --git a/polars/polars-time/Cargo.toml b/polars/polars-time/Cargo.toml index 962d85a50fc8..b2cb519a2577 100644 --- a/polars/polars-time/Cargo.toml +++ b/polars/polars-time/Cargo.toml @@ -20,6 +20,7 @@ polars-ops = { version = "0.27.2", path = "../polars-ops" } polars-utils = { version = "0.27.2", path = "../polars-utils" } regex = "1.7.1" serde = { version = "1", features = ["derive"], optional = true } +smartstring.workspace = true [features] dtype-date = ["polars-core/dtype-date", "polars-core/temporal"] diff --git a/polars/polars-time/src/groupby/dynamic.rs b/polars/polars-time/src/groupby/dynamic.rs index 13ac82522ed0..8d79b3ff5d0f 100644 --- a/polars/polars-time/src/groupby/dynamic.rs +++ b/polars/polars-time/src/groupby/dynamic.rs @@ -5,6 +5,7 @@ use polars_core::prelude::*; use polars_core::POOL; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +use smartstring::alias::String as SmartString; use crate::prelude::*; @@ -15,7 +16,7 @@ struct Wrap(pub T); #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct DynamicGroupOptions { /// Time or index column - pub index_column: String, + pub index_column: SmartString, /// start a window at this interval pub every: Duration, /// window duration @@ -33,7 +34,7 @@ pub struct DynamicGroupOptions { impl Default for DynamicGroupOptions { fn default() -> Self { Self { - index_column: "".to_string(), + index_column: "".into(), every: Duration::new(1), period: Duration::new(1), offset: Duration::new(1), @@ -49,7 +50,7 @@ impl Default for DynamicGroupOptions { #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] pub struct RollingGroupOptions { /// Time or index column - pub index_column: String, + pub index_column: SmartString, /// window duration pub period: Duration, pub offset: Duration, diff --git a/polars/polars-utils/Cargo.toml b/polars/polars-utils/Cargo.toml index f666daa5453c..07c1900b796c 100644 --- a/polars/polars-utils/Cargo.toml +++ b/polars/polars-utils/Cargo.toml @@ -11,6 +11,7 @@ description = "private utils for the polars dataframe library" [dependencies] once_cell.workspace = true rayon.workspace = true +smartstring.workspace = true sysinfo = { version = "0.28", default-features = false, optional = true } [features] diff --git a/polars/polars-utils/src/fmt.rs b/polars/polars-utils/src/fmt.rs new file mode 100644 index 000000000000..8677c81fbcda --- /dev/null +++ b/polars/polars-utils/src/fmt.rs @@ -0,0 +1,11 @@ +#[macro_export] +macro_rules! format_smartstring { + ($($arg:tt)*) => {{ + use smartstring::alias::String as SmartString; + use std::fmt::Write; + + let mut string = SmartString::new(); + write!(string, $($arg)*).unwrap(); + string + }} +} diff --git a/polars/polars-utils/src/lib.rs b/polars/polars-utils/src/lib.rs index a827b90402e2..027ea4305954 100644 --- a/polars/polars-utils/src/lib.rs +++ b/polars/polars-utils/src/lib.rs @@ -24,6 +24,7 @@ pub type IdxSize = u32; #[cfg(feature = "bigidx")] pub type IdxSize = u64; +pub mod fmt; pub mod iter; pub mod macros; #[cfg(target_family = "wasm")] diff --git a/polars/tests/it/io/csv.rs b/polars/tests/it/io/csv.rs index a1888d4d9c1d..ddc63e12ab34 100644 --- a/polars/tests/it/io/csv.rs +++ b/polars/tests/it/io/csv.rs @@ -362,7 +362,7 @@ fn test_empty_bytes_to_dataframe() { let result = CsvReader::new(file) .has_header(false) - .with_columns(Some(schema.iter_names().cloned().collect())) + .with_columns(Some(schema.iter_names().map(|s| s.to_string()).collect())) .with_schema(&schema) .finish(); assert!(result.is_ok()) diff --git a/polars/tests/it/io/json.rs b/polars/tests/it/io/json.rs index 2d20b991c2df..b3b65d8b23cd 100644 --- a/polars/tests/it/io/json.rs +++ b/polars/tests/it/io/json.rs @@ -145,14 +145,14 @@ fn test_read_ndjson_iss_5875() { let mut schema = Schema::new(); schema.with_column( - "struct".to_owned(), + "struct".into(), DataType::Struct(vec![ field_int_inner.clone(), field_float_inner.clone(), field_str_inner.clone(), ]), ); - schema.with_column("float".to_owned(), DataType::Float64); + schema.with_column("float".into(), DataType::Float64); assert_eq!(schema, df.unwrap().schema()); } @@ -178,11 +178,11 @@ fn test_read_ndjson_iss_5875_part2() { ); let mut schema = Schema::new(); schema.with_column( - "struct".to_owned(), + "struct".into(), DataType::Struct(vec![field_int_list_inner, field_float, field_str_list]), ); schema.with_column( - "float_list_outer".to_owned(), + "float_list_outer".into(), field_float_list.data_type().clone(), ); diff --git a/polars/tests/it/schema.rs b/polars/tests/it/schema.rs index 7c90a699bed1..479986e52303 100644 --- a/polars/tests/it/schema.rs +++ b/polars/tests/it/schema.rs @@ -11,7 +11,7 @@ fn test_schema_rename() { ] .into_iter(), ); - schema.rename("a", "anton".to_string()).unwrap(); + schema.rename("a", "anton".into()).unwrap(); let expected = Schema::from( [ Field::new("anton", UInt64), diff --git a/py-polars/Cargo.lock b/py-polars/Cargo.lock index 2d331c5bd298..e56121681f9d 100644 --- a/py-polars/Cargo.lock +++ b/py-polars/Cargo.lock @@ -1580,6 +1580,7 @@ dependencies = [ "polars-utils", "pyo3", "rayon", + "smartstring", ] [[package]] @@ -1596,6 +1597,7 @@ dependencies = [ "polars-utils", "serde", "serde_json", + "smartstring", ] [[package]] @@ -1633,6 +1635,7 @@ dependencies = [ "rayon", "regex", "serde", + "smartstring", ] [[package]] @@ -1672,6 +1675,7 @@ dependencies = [ "polars-utils", "regex", "serde", + "smartstring", ] [[package]] @@ -1680,6 +1684,7 @@ version = "0.27.2" dependencies = [ "once_cell", "rayon", + "smartstring", "sysinfo", ] @@ -1717,6 +1722,7 @@ dependencies = [ "pyo3", "pyo3-built", "serde_json", + "smartstring", "thiserror", ] @@ -2071,6 +2077,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3fb72c633efbaa2dd666986505016c32c3044395ceaf881518399d2f4127ee29" dependencies = [ "autocfg", + "serde", "static_assertions", "version_check", ] diff --git a/py-polars/Cargo.toml b/py-polars/Cargo.toml index e2b82ed57ec5..80d6bf92352b 100644 --- a/py-polars/Cargo.toml +++ b/py-polars/Cargo.toml @@ -26,6 +26,7 @@ polars-lazy = { path = "../polars/polars-lazy", features = ["python"], default-f pyo3 = { version = "0.18.0", features = ["abi3-py37", "extension-module", "multiple-pymethods"] } pyo3-built = { version = "0.4", optional = true } serde_json = { version = "1", optional = true } +smartstring = "1" thiserror = "^1.0" # features are only there to enable building a slim binary for the benchmark in CI diff --git a/py-polars/src/conversion.rs b/py-polars/src/conversion.rs index cfb00adbeb3e..0f7d0b59c8bb 100644 --- a/py-polars/src/conversion.rs +++ b/py-polars/src/conversion.rs @@ -20,6 +20,7 @@ use pyo3::exceptions::{PyTypeError, PyValueError}; use pyo3::prelude::*; use pyo3::types::{PyBool, PyBytes, PyDict, PyList, PySequence}; use pyo3::{PyAny, PyResult}; +use smartstring::alias::String as SmartString; use crate::dataframe::PyDataFrame; use crate::error::PyPolarsErr; @@ -173,7 +174,7 @@ fn struct_dict<'a>( ) -> PyObject { let dict = PyDict::new(py); for (fld, val) in flds.iter().zip(vals) { - dict.set_item(fld.name(), Wrap(val)).unwrap() + dict.set_item(fld.name().as_str(), Wrap(val)).unwrap() } dict.into_py(py) } @@ -297,7 +298,7 @@ impl ToPyObject for Wrap { DataType::Struct(fields) => { let field_class = pl.getattr("Field").unwrap(); let iter = fields.iter().map(|fld| { - let name = fld.name().clone(); + let name = fld.name().as_str(); let dtype = Wrap(fld.data_type().clone()).to_object(py); field_class.call1((name, dtype)).unwrap() }); @@ -1178,3 +1179,11 @@ pub(crate) fn parse_parquet_compression( }; Ok(parsed) } + +pub(crate) fn strings_to_smartstrings(container: I) -> Vec +where + I: IntoIterator, + S: AsRef, +{ + container.into_iter().map(|s| s.as_ref().into()).collect() +} diff --git a/py-polars/src/dataframe.rs b/py-polars/src/dataframe.rs index 6d579421248a..371046caaf4f 100644 --- a/py-polars/src/dataframe.rs +++ b/py-polars/src/dataframe.rs @@ -31,7 +31,7 @@ use crate::conversion::{ObjectValue, Wrap}; use crate::error::PyPolarsErr; use crate::file::{get_either_file, get_file_like, get_mmap_bytes_reader, EitherRustPythonFile}; use crate::lazy::dataframe::PyLazyFrame; -use crate::prelude::dicts_to_rows; +use crate::prelude::{dicts_to_rows, strings_to_smartstrings}; use crate::series::{to_pyseries_collection, to_series_collection, PySeries}; use crate::{arrow_interop, py_modules, PyExpr}; @@ -1093,16 +1093,16 @@ impl PyDataFrame { pub fn melt( &self, - id_vars: Vec, - value_vars: Vec, - value_name: Option, - variable_name: Option, + id_vars: Vec<&str>, + value_vars: Vec<&str>, + value_name: Option<&str>, + variable_name: Option<&str>, ) -> PyResult { let args = MeltArgs { - id_vars, - value_vars, - value_name, - variable_name, + id_vars: strings_to_smartstrings(id_vars), + value_vars: strings_to_smartstrings(value_vars), + value_name: value_name.map(|s| s.into()), + variable_name: variable_name.map(|s| s.into()), }; let df = self.df.melt2(args).map_err(PyPolarsErr::from)?; diff --git a/py-polars/src/lazy/dataframe.rs b/py-polars/src/lazy/dataframe.rs index 2a90f23f76a6..379a971b01da 100644 --- a/py-polars/src/lazy/dataframe.rs +++ b/py-polars/src/lazy/dataframe.rs @@ -259,7 +259,7 @@ impl PyLazyFrame { if let Some(lambda) = with_schema_modify { let f = |schema: Schema| { - let iter = schema.iter_names(); + let iter = schema.iter_names().map(|s| s.as_str()); Python::with_gil(|py| { let names = PyList::new(py, iter); @@ -272,7 +272,7 @@ impl PyLazyFrame { let fields = schema .iter_dtypes() .zip(new_names) - .map(|(dtype, name)| Field::from_owned(name, dtype.clone())); + .map(|(dtype, name)| Field::from_owned(name.into(), dtype.clone())); Ok(Schema::from(fields)) }) }; @@ -541,7 +541,7 @@ impl PyLazyFrame { pub fn groupby_rolling( &mut self, - index_column: String, + index_column: &str, period: &str, offset: &str, closed: Wrap, @@ -556,7 +556,7 @@ impl PyLazyFrame { let lazy_gb = ldf.groupby_rolling( by, RollingGroupOptions { - index_column, + index_column: index_column.into(), period: Duration::parse(period), offset: Duration::parse(offset), closed_window, @@ -569,7 +569,7 @@ impl PyLazyFrame { #[allow(clippy::too_many_arguments)] pub fn groupby_dynamic( &mut self, - index_column: String, + index_column: &str, every: &str, period: &str, offset: &str, @@ -588,7 +588,7 @@ impl PyLazyFrame { let lazy_gb = ldf.groupby_dynamic( by, DynamicGroupOptions { - index_column, + index_column: index_column.into(), every: Duration::parse(every), period: Duration::parse(period), offset: Duration::parse(offset), @@ -615,8 +615,8 @@ impl PyLazyFrame { other: PyLazyFrame, left_on: PyExpr, right_on: PyExpr, - left_by: Option>, - right_by: Option>, + left_by: Option>, + right_by: Option>, allow_parallel: bool, force_parallel: bool, suffix: String, @@ -637,10 +637,10 @@ impl PyLazyFrame { .force_parallel(force_parallel) .how(JoinType::AsOf(AsOfOptions { strategy: strategy.0, - left_by, - right_by, + left_by: left_by.map(strings_to_smartstrings), + right_by: right_by.map(strings_to_smartstrings), tolerance: tolerance.map(|t| t.0.into_static().unwrap()), - tolerance_str, + tolerance_str: tolerance_str.map(|s| s.into()), })) .suffix(suffix) .finish() @@ -802,10 +802,10 @@ impl PyLazyFrame { variable_name: Option, ) -> Self { let args = MeltArgs { - id_vars, - value_vars, - value_name, - variable_name, + id_vars: strings_to_smartstrings(id_vars), + value_vars: strings_to_smartstrings(value_vars), + value_name: value_name.map(|s| s.into()), + variable_name: variable_name.map(|s| s.into()), }; let ldf = self.ldf.clone(); @@ -915,8 +915,10 @@ impl PyLazyFrame { self.ldf.clone().into() } - pub fn columns(&self) -> PyResult> { - Ok(self.get_schema()?.iter_names().cloned().collect()) + pub fn columns(&self, py: Python) -> PyResult { + let schema = self.get_schema()?; + let iter = schema.iter_names().map(|s| s.as_str()); + Ok(PyList::new(py, iter).to_object(py)) } pub fn dtypes(&self, py: Python) -> PyResult { @@ -933,7 +935,7 @@ impl PyLazyFrame { schema.iter_fields().for_each(|fld| { schema_dict - .set_item(fld.name(), Wrap(fld.data_type().clone())) + .set_item(fld.name().as_str(), Wrap(fld.data_type().clone())) .unwrap() }); Ok(schema_dict.to_object(py)) diff --git a/py-polars/src/lazy/dsl.rs b/py-polars/src/lazy/dsl.rs index b280345cb1df..a930723f3cfa 100644 --- a/py-polars/src/lazy/dsl.rs +++ b/py-polars/src/lazy/dsl.rs @@ -8,6 +8,7 @@ use pyo3::class::basic::CompareOp; use pyo3::exceptions::PyValueError; use pyo3::prelude::*; use pyo3::types::{PyBool, PyBytes, PyFloat, PyInt, PyString}; +use smartstring::alias::String as SmartString; use super::apply::*; use crate::conversion::{parse_fill_null_strategy, Wrap}; @@ -1605,7 +1606,8 @@ impl PyExpr { Arc::new(move |idx: usize| { Python::with_gil(|py| { let out = lambda.call1(py, (idx,)).unwrap(); - out.extract::(py).unwrap() + let out: SmartString = out.extract::<&str>(py).unwrap().into(); + out }) }) as NameGenerator });