Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions datafusion/functions/src/string/concat.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ use std::sync::Arc;

use crate::string::concat;
use crate::strings::{
ColumnarValueRef, LargeStringArrayBuilder, StringArrayBuilder, StringViewArrayBuilder,
ColumnarValueRef, ConcatLargeStringBuilder, ConcatStringBuilder,
ConcatStringViewBuilder,
};
use datafusion_common::cast::{as_binary_array, as_string_array, as_string_view_array};
use datafusion_common::{
Expand Down Expand Up @@ -242,7 +243,7 @@ impl ScalarUDFImpl for ConcatFunc {

match return_datatype {
DataType::Utf8 => {
let mut builder = StringArrayBuilder::with_capacity(len, data_size);
let mut builder = ConcatStringBuilder::with_capacity(len, data_size);
for i in 0..len {
columns
.iter()
Expand All @@ -254,7 +255,7 @@ impl ScalarUDFImpl for ConcatFunc {
Ok(ColumnarValue::Array(Arc::new(string_array)))
}
DataType::Utf8View => {
let mut builder = StringViewArrayBuilder::with_capacity(len, data_size);
let mut builder = ConcatStringViewBuilder::with_capacity(len, data_size);
for i in 0..len {
columns
.iter()
Expand All @@ -266,7 +267,7 @@ impl ScalarUDFImpl for ConcatFunc {
Ok(ColumnarValue::Array(Arc::new(string_array)))
}
DataType::LargeUtf8 => {
let mut builder = LargeStringArrayBuilder::with_capacity(len, data_size);
let mut builder = ConcatLargeStringBuilder::with_capacity(len, data_size);
for i in 0..len {
columns
.iter()
Expand Down
9 changes: 5 additions & 4 deletions datafusion/functions/src/string/concat_ws.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ use crate::string::concat;
use crate::string::concat::simplify_concat;
use crate::string::concat_ws;
use crate::strings::{
ColumnarValueRef, LargeStringArrayBuilder, StringArrayBuilder, StringViewArrayBuilder,
ColumnarValueRef, ConcatLargeStringBuilder, ConcatStringBuilder,
ConcatStringViewBuilder,
};
use datafusion_common::cast::{
as_large_string_array, as_string_array, as_string_view_array,
Expand Down Expand Up @@ -311,7 +312,7 @@ impl ScalarUDFImpl for ConcatWsFunc {

match return_datatype {
DataType::Utf8View => {
let mut builder = StringViewArrayBuilder::with_capacity(len, data_size);
let mut builder = ConcatStringViewBuilder::with_capacity(len, data_size);
for i in 0..len {
if !sep.is_valid(i) {
builder.append_offset()?;
Expand All @@ -332,7 +333,7 @@ impl ScalarUDFImpl for ConcatWsFunc {
Ok(ColumnarValue::Array(Arc::new(builder.finish(sep.nulls())?)))
}
DataType::LargeUtf8 => {
let mut builder = LargeStringArrayBuilder::with_capacity(len, data_size);
let mut builder = ConcatLargeStringBuilder::with_capacity(len, data_size);
for i in 0..len {
if !sep.is_valid(i) {
builder.append_offset()?;
Expand All @@ -353,7 +354,7 @@ impl ScalarUDFImpl for ConcatWsFunc {
Ok(ColumnarValue::Array(Arc::new(builder.finish(sep.nulls())?)))
}
_ => {
let mut builder = StringArrayBuilder::with_capacity(len, data_size);
let mut builder = ConcatStringBuilder::with_capacity(len, data_size);
for i in 0..len {
if !sep.is_valid(i) {
builder.append_offset()?;
Expand Down
48 changes: 27 additions & 21 deletions datafusion/functions/src/strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,17 +26,20 @@ use arrow::array::{
use arrow::buffer::{Buffer, MutableBuffer, NullBuffer, ScalarBuffer};
use arrow::datatypes::DataType;

/// Optimized version of the StringBuilder in Arrow that:
/// 1. Precalculating the expected length of the result, avoiding reallocations.
/// 2. Avoids creating / incrementally creating a `NullBufferBuilder`
pub struct StringArrayBuilder {
/// Builder used by `concat`/`concat_ws` to assemble a [`StringArray`] one row
/// at a time from multiple input columns.
///
/// Each row is written via repeated `write` calls, followed by a single
/// `append_offset` call to commit the row. The output null buffer is supplied
/// by the caller at `finish` time.
pub(crate) struct ConcatStringBuilder {
offsets_buffer: MutableBuffer,
value_buffer: MutableBuffer,
/// If true, a safety check is required during the `finish` call
tainted: bool,
}

impl StringArrayBuilder {
impl ConcatStringBuilder {
pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
let capacity = item_capacity
.checked_add(1)
Expand Down Expand Up @@ -151,21 +154,21 @@ impl StringArrayBuilder {
}
}

/// Optimized version of Arrow's [`StringViewBuilder`]. Rather than adding NULLs
/// on a row-by-row basis, the caller should provide nulls when calling
/// [`finish`](Self::finish). This allows callers to compute nulls more
/// efficiently (e.g., via bulk bitmap operations).
/// Builder used by `concat`/`concat_ws` to assemble a [`StringViewArray`] one
/// row at a time from multiple input columns.
///
/// [`StringViewBuilder`]: arrow::array::StringViewBuilder
pub struct StringViewArrayBuilder {
/// Each row is written via repeated `write` calls, followed by a single
/// `append_offset` call to commit the row as a single string view. The output
/// null buffer is supplied by the caller at `finish` time.
pub(crate) struct ConcatStringViewBuilder {
views: Vec<u128>,
data: Vec<u8>,
block: Vec<u8>,
/// If true, a safety check is required during the `append_offset` call
tainted: bool,
}

impl StringViewArrayBuilder {
impl ConcatStringViewBuilder {
pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
Self {
views: Vec::with_capacity(item_capacity),
Expand Down Expand Up @@ -286,14 +289,17 @@ impl StringViewArrayBuilder {
}
}

pub struct LargeStringArrayBuilder {
/// Builder used by `concat`/`concat_ws` to assemble a [`LargeStringArray`] one
/// row at a time from multiple input columns. See [`ConcatStringBuilder`] for
/// details on the row-composition contract.
pub(crate) struct ConcatLargeStringBuilder {
offsets_buffer: MutableBuffer,
value_buffer: MutableBuffer,
/// If true, a safety check is required during the `finish` call
tainted: bool,
}

impl LargeStringArrayBuilder {
impl ConcatLargeStringBuilder {
pub fn with_capacity(item_capacity: usize, data_capacity: usize) -> Self {
let capacity = item_capacity
.checked_add(1)
Expand Down Expand Up @@ -424,9 +430,9 @@ impl LargeStringArrayBuilder {
/// - start_offset: The start offset of the substring in the view
///
/// LLVM is apparently overly eager to inline this function into some hot loops,
/// which bloats them and regresses performance, so we disable inling for now.
/// which bloats them and regresses performance, so we disable inlining for now.
#[inline(never)]
pub fn append_view(
pub(crate) fn append_view(
views_buffer: &mut Vec<u128>,
original_view: &u128,
substr: &str,
Expand All @@ -447,7 +453,7 @@ pub fn append_view(
}

#[derive(Debug)]
pub enum ColumnarValueRef<'a> {
pub(crate) enum ColumnarValueRef<'a> {
Scalar(&'a [u8]),
NullableArray(&'a StringArray),
NonNullableArray(&'a StringArray),
Expand Down Expand Up @@ -497,13 +503,13 @@ mod tests {

#[test]
#[should_panic(expected = "capacity integer overflow")]
fn test_overflow_string_array_builder() {
let _builder = StringArrayBuilder::with_capacity(usize::MAX, usize::MAX);
fn test_overflow_concat_string_builder() {
let _builder = ConcatStringBuilder::with_capacity(usize::MAX, usize::MAX);
}

#[test]
#[should_panic(expected = "capacity integer overflow")]
fn test_overflow_large_string_array_builder() {
let _builder = LargeStringArrayBuilder::with_capacity(usize::MAX, usize::MAX);
fn test_overflow_concat_large_string_builder() {
let _builder = ConcatLargeStringBuilder::with_capacity(usize::MAX, usize::MAX);
}
}
8 changes: 8 additions & 0 deletions docs/source/library-user-guide/upgrading/54.0.0.md
Original file line number Diff line number Diff line change
Expand Up @@ -320,4 +320,12 @@ The difference is only observable for strings containing combining characters
clusters (e.g., ZWJ emoji sequences). For ASCII and most common Unicode text,
behavior is unchanged.

### Items in `datafusion_functions::strings` are no longer public

`StringArrayBuilder`, `LargeStringArrayBuilder`, `StringViewArrayBuilder`,
`ColumnarValueRef`, and `append_view` have been reduced to `pub(crate)`. They
were only ever used to implement `concat` and `concat_ws` inside the crate. If
you were importing them externally, use Arrow's corresponding builders with a
caller-computed `NullBuffer`.

[#17861]: https://github.com/apache/datafusion/pull/17861
Loading