diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 8d52f555bbf76..062a3d14a0d86 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -3435,21 +3435,17 @@ mod tests { } #[tokio::test] - async fn in_list_types() -> Result<()> { - // expression: "a in ('a', 1)" + async fn in_list_types_mixed_string_int_error() -> Result<()> { + // expression: "c1 in ('a', 1)" where c1 is Utf8 let list = vec![lit("a"), lit(1i64)]; let logical_plan = test_csv_scan() .await? - // filter clause needs the type coercion rule applied .filter(col("c12").lt(lit(0.05)))? .project(vec![col("c1").in_list(list, false)])? .build()?; - let execution_plan = plan(&logical_plan).await?; - // verify that the plan correctly adds cast from Int64(1) to Utf8, and the const will be evaluated. - - let expected = r#"expr: BinaryExpr { left: BinaryExpr { left: Column { name: "c1", index: 0 }, op: Eq, right: Literal { value: Utf8("a"), field: Field { name: "lit", data_type: Utf8 } }, fail_on_overflow: false }"#; + let e = plan(&logical_plan).await.unwrap_err().to_string(); - assert_contains!(format!("{execution_plan:?}"), expected); + assert_contains!(&e, "Cannot cast string 'a' to value of Int64 type"); Ok(()) } diff --git a/datafusion/core/tests/expr_api/mod.rs b/datafusion/core/tests/expr_api/mod.rs index 91dd5de7fcd64..bd2288ae9ebae 100644 --- a/datafusion/core/tests/expr_api/mod.rs +++ b/datafusion/core/tests/expr_api/mod.rs @@ -342,20 +342,25 @@ fn test_create_physical_expr_nvl2() { #[tokio::test] async fn test_create_physical_expr_coercion() { - // create_physical_expr does apply type coercion and unwrapping in cast + // create_physical_expr applies type coercion (and can unwrap/fold + // literal casts). Comparison coercion prefers numeric types, so + // string/int comparisons cast the string side to the numeric type. // - // expect the cast on the literals - // compare string function to int `id = 1` - create_expr_test(col("id").eq(lit(1i32)), "id@0 = CAST(1 AS Utf8)"); - create_expr_test(lit(1i32).eq(col("id")), "CAST(1 AS Utf8) = id@0"); - // compare int col to string literal `i = '202410'` - // Note this casts the column (not the field) - create_expr_test(col("i").eq(lit("202410")), "CAST(i@1 AS Utf8) = 202410"); - create_expr_test(lit("202410").eq(col("i")), "202410 = CAST(i@1 AS Utf8)"); - // however, when simplified the casts on i should removed - // https://github.com/apache/datafusion/issues/14944 - create_simplified_expr_test(col("i").eq(lit("202410")), "CAST(i@1 AS Utf8) = 202410"); - create_simplified_expr_test(lit("202410").eq(col("i")), "CAST(i@1 AS Utf8) = 202410"); + // string column vs int literal: id (Utf8) is cast to Int32 + create_expr_test(col("id").eq(lit(1i32)), "CAST(id@0 AS Int32) = 1"); + create_expr_test(lit(1i32).eq(col("id")), "1 = CAST(id@0 AS Int32)"); + // int column vs string literal: the string literal is cast to Int64 + create_expr_test(col("i").eq(lit("202410")), "i@1 = CAST(202410 AS Int64)"); + create_expr_test(lit("202410").eq(col("i")), "CAST(202410 AS Int64) = i@1"); + // when simplified, the literal cast is constant-folded + create_simplified_expr_test( + col("i").eq(lit("202410")), + "i@1 = CAST(202410 AS Int64)", + ); + create_simplified_expr_test( + lit("202410").eq(col("i")), + "i@1 = CAST(202410 AS Int64)", + ); } /// Evaluates the specified expr as an aggregate and compares the result to the diff --git a/datafusion/core/tests/sql/unparser.rs b/datafusion/core/tests/sql/unparser.rs index ab1015b2d18d9..47d30871c268b 100644 --- a/datafusion/core/tests/sql/unparser.rs +++ b/datafusion/core/tests/sql/unparser.rs @@ -143,16 +143,26 @@ fn tpch_queries() -> Vec { } /// Create a new SessionContext for testing that has all Clickbench tables registered. +/// +/// Registers the raw Parquet as `hits_raw`, then creates a `hits` view that +/// casts `EventDate` from UInt16 (day-offset) to DATE. This mirrors the +/// approach used by the benchmark runner in `benchmarks/src/clickbench.rs`. async fn clickbench_test_context() -> Result { let ctx = SessionContext::new(); ctx.register_parquet( - "hits", + "hits_raw", "tests/data/clickbench_hits_10.parquet", ParquetReadOptions::default(), ) .await?; - // Sanity check we found the table by querying it's schema, it should not be empty - // Otherwise if the path is wrong the tests will all fail in confusing ways + ctx.sql( + r#"CREATE VIEW hits AS + SELECT * EXCEPT ("EventDate"), + CAST(CAST("EventDate" AS INTEGER) AS DATE) AS "EventDate" + FROM hits_raw"#, + ) + .await?; + // Sanity check we found the table by querying its schema let df = ctx.sql("SELECT * FROM hits LIMIT 1").await?; assert!( !df.schema().fields().is_empty(), diff --git a/datafusion/expr-common/src/interval_arithmetic.rs b/datafusion/expr-common/src/interval_arithmetic.rs index f93ef3b79595b..fb4f1f37b8ced 100644 --- a/datafusion/expr-common/src/interval_arithmetic.rs +++ b/datafusion/expr-common/src/interval_arithmetic.rs @@ -22,7 +22,7 @@ use std::fmt::{self, Display, Formatter}; use std::ops::{AddAssign, SubAssign}; use crate::operator::Operator; -use crate::type_coercion::binary::{BinaryTypeCoercer, comparison_coercion_numeric}; +use crate::type_coercion::binary::{BinaryTypeCoercer, comparison_coercion}; use arrow::compute::{CastOptions, cast_with_options}; use arrow::datatypes::{ @@ -734,7 +734,7 @@ impl Interval { (self.lower.clone(), self.upper.clone(), rhs.clone()) } else { let maybe_common_type = - comparison_coercion_numeric(&self.data_type(), &rhs.data_type()); + comparison_coercion(&self.data_type(), &rhs.data_type()); assert_or_internal_err!( maybe_common_type.is_some(), "Data types must be compatible for containment checks, lhs:{}, rhs:{}", diff --git a/datafusion/expr-common/src/signature.rs b/datafusion/expr-common/src/signature.rs index 82759be9f75e8..42d4de939a8e8 100644 --- a/datafusion/expr-common/src/signature.rs +++ b/datafusion/expr-common/src/signature.rs @@ -158,7 +158,7 @@ pub enum Arity { pub enum TypeSignature { /// One or more arguments of a common type out of a list of valid types. /// - /// For functions that take no arguments (e.g. `random()` see [`TypeSignature::Nullary`]). + /// For functions that take no arguments (e.g. `random()`), see [`TypeSignature::Nullary`]. /// /// # Examples /// @@ -184,7 +184,7 @@ pub enum TypeSignature { Uniform(usize, Vec), /// One or more arguments with exactly the specified types in order. /// - /// For functions that take no arguments (e.g. `random()`) use [`TypeSignature::Nullary`]. + /// For functions that take no arguments (e.g. `random()`), use [`TypeSignature::Nullary`]. Exact(Vec), /// One or more arguments belonging to the [`TypeSignatureClass`], in order. /// @@ -192,12 +192,12 @@ pub enum TypeSignature { /// casts. For example, if you expect a function has string type, but you /// also allow it to be casted from binary type. /// - /// For functions that take no arguments (e.g. `random()`) see [`TypeSignature::Nullary`]. + /// For functions that take no arguments (e.g. `random()`), see [`TypeSignature::Nullary`]. Coercible(Vec), /// One or more arguments coercible to a single, comparable type. /// /// Each argument will be coerced to a single type using the - /// coercion rules described in [`comparison_coercion_numeric`]. + /// coercion rules described in [`comparison_coercion`]. /// /// # Examples /// @@ -205,17 +205,18 @@ pub enum TypeSignature { /// the types will both be coerced to `i64` before the function is invoked. /// /// If the `nullif('1', 2)` function is called with `Utf8` and `i64` arguments - /// the types will both be coerced to `Utf8` before the function is invoked. + /// the types will both be coerced to `Int64` before the function is invoked + /// (numeric is preferred over string). /// /// Note: - /// - For functions that take no arguments (e.g. `random()` see [`TypeSignature::Nullary`]). + /// - For functions that take no arguments (e.g. `random()`), see [`TypeSignature::Nullary`]. /// - If all arguments have type [`DataType::Null`], they are coerced to `Utf8` /// - /// [`comparison_coercion_numeric`]: crate::type_coercion::binary::comparison_coercion_numeric + /// [`comparison_coercion`]: crate::type_coercion::binary::comparison_coercion Comparable(usize), /// One or more arguments of arbitrary types. /// - /// For functions that take no arguments (e.g. `random()`) use [`TypeSignature::Nullary`]. + /// For functions that take no arguments (e.g. `random()`), use [`TypeSignature::Nullary`]. Any(usize), /// Matches exactly one of a list of [`TypeSignature`]s. /// @@ -233,7 +234,7 @@ pub enum TypeSignature { /// /// See [`NativeType::is_numeric`] to know which type is considered numeric /// - /// For functions that take no arguments (e.g. `random()`) use [`TypeSignature::Nullary`]. + /// For functions that take no arguments (e.g. `random()`), use [`TypeSignature::Nullary`]. /// /// [`NativeType::is_numeric`]: datafusion_common::types::NativeType::is_numeric Numeric(usize), @@ -246,7 +247,7 @@ pub enum TypeSignature { /// For example, if a function is called with (utf8, large_utf8), all /// arguments will be coerced to `LargeUtf8` /// - /// For functions that take no arguments (e.g. `random()` use [`TypeSignature::Nullary`]). + /// For functions that take no arguments (e.g. `random()`), use [`TypeSignature::Nullary`]. String(usize), /// No arguments Nullary, diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index fa109e38a4382..a6e36a7e4eeac 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -33,7 +33,6 @@ use arrow::datatypes::{ DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, DataType, Field, FieldRef, Fields, TimeUnit, }; -use datafusion_common::types::NativeType; use datafusion_common::{ Diagnostic, Result, Span, Spans, exec_err, internal_err, not_impl_err, plan_datafusion_err, plan_err, @@ -748,7 +747,7 @@ fn type_union_resolution_coercion( .or_else(|| list_coercion(lhs_type, rhs_type)) .or_else(|| temporal_coercion_nonstrict_timezone(lhs_type, rhs_type)) .or_else(|| string_coercion(lhs_type, rhs_type)) - .or_else(|| numeric_string_coercion(lhs_type, rhs_type)) + .or_else(|| string_numeric_coercion(lhs_type, rhs_type)) .or_else(|| binary_coercion(lhs_type, rhs_type)) } } @@ -843,102 +842,104 @@ pub fn try_type_union_resolution_with_struct( Ok(final_struct_types) } -/// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of a -/// comparison operation +/// Coerce `lhs_type` and `rhs_type` to a common type for type unification +/// contexts — where two values must be brought to a common type but are not +/// being compared. Examples: UNION, CASE THEN/ELSE branches, NVL2. For other +/// contexts, [`comparison_coercion`] should typically be used instead. /// -/// Example comparison operations are `lhs = rhs` and `lhs > rhs` -/// -/// Binary comparison kernels require the two arguments to be the (exact) same -/// data type. However, users can write queries where the two arguments are -/// different data types. In such cases, the data types are automatically cast -/// (coerced) to a single data type to pass to the kernels. -/// -/// # Numeric comparisons -/// -/// When comparing numeric values, the lower precision type is coerced to the -/// higher precision type to avoid losing data. For example when comparing -/// `Int32` to `Int64` the coerced type is `Int64` so the `Int32` argument will -/// be cast. -/// -/// # Numeric / String comparisons -/// -/// When comparing numeric values and strings, both values will be coerced to -/// strings. For example when comparing `'2' > 1`, the arguments will be -/// coerced to `Utf8` for comparison -pub fn comparison_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { +/// The intuition is that we try to find the "widest" type that can represent +/// all values from both sides. When one side is a string and the other is +/// numeric, this prefers strings because every number has a textual +/// representation but not every string can be parsed as a number (e.g., `SELECT +/// 1 UNION SELECT 'a'` coerces both sides to a string). This is in contrast to +/// [`comparison_coercion`], which prefers numeric types so that ordering and +/// equality follow numeric semantics. +pub fn type_union_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { if lhs_type.equals_datatype(rhs_type) { - // same type => equality is possible return Some(lhs_type.clone()); } binary_numeric_coercion(lhs_type, rhs_type) - .or_else(|| dictionary_comparison_coercion(lhs_type, rhs_type, true)) - .or_else(|| ree_comparison_coercion(lhs_type, rhs_type, true)) + .or_else(|| dictionary_coercion(lhs_type, rhs_type, true, type_union_coercion)) + .or_else(|| ree_coercion(lhs_type, rhs_type, true, type_union_coercion)) .or_else(|| temporal_coercion_nonstrict_timezone(lhs_type, rhs_type)) .or_else(|| string_coercion(lhs_type, rhs_type)) .or_else(|| list_coercion(lhs_type, rhs_type)) .or_else(|| null_coercion(lhs_type, rhs_type)) - .or_else(|| string_numeric_coercion(lhs_type, rhs_type)) + .or_else(|| string_numeric_union_coercion(lhs_type, rhs_type)) .or_else(|| string_temporal_coercion(lhs_type, rhs_type)) .or_else(|| binary_coercion(lhs_type, rhs_type)) - .or_else(|| struct_coercion(lhs_type, rhs_type)) - .or_else(|| map_coercion(lhs_type, rhs_type)) + .or_else(|| struct_coercion(lhs_type, rhs_type, type_union_coercion)) + .or_else(|| map_coercion(lhs_type, rhs_type, type_union_coercion)) } -/// Similar to [`comparison_coercion`] but prefers numeric if compares with -/// numeric and string +/// Coerce `lhs_type` and `rhs_type` to a common type for comparison +/// contexts — any context where two values are compared rather than +/// unified. This includes binary comparison operators, IN lists, +/// CASE/WHEN conditions, and BETWEEN. +/// +/// When the two types differ, this function determines the common type +/// to cast to. /// /// # Numeric comparisons /// -/// When comparing numeric values and strings, the values will be coerced to the -/// numeric type. For example, `'2' > 1` if `1` is an `Int32`, the arguments -/// will be coerced to `Int32`. -pub fn comparison_coercion_numeric( - lhs_type: &DataType, - rhs_type: &DataType, -) -> Option { - if lhs_type == rhs_type { +/// The lower precision type is widened to the higher precision type +/// (e.g., `Int32` vs `Int64` → `Int64`). +/// +/// # Numeric / String comparisons +/// +/// Prefers the numeric type (e.g., `'2' > 1` where `1` is `Int32` coerces +/// `'2'` to `Int32`). +/// +/// For type unification contexts (UNION, CASE THEN/ELSE), use +/// [`type_union_coercion`] instead. +pub fn comparison_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { + if lhs_type.equals_datatype(rhs_type) { // same type => equality is possible return Some(lhs_type.clone()); } binary_numeric_coercion(lhs_type, rhs_type) - .or_else(|| dictionary_comparison_coercion_numeric(lhs_type, rhs_type, true)) - .or_else(|| ree_comparison_coercion_numeric(lhs_type, rhs_type, true)) + .or_else(|| dictionary_coercion(lhs_type, rhs_type, true, comparison_coercion)) + .or_else(|| ree_coercion(lhs_type, rhs_type, true, comparison_coercion)) + .or_else(|| temporal_coercion_nonstrict_timezone(lhs_type, rhs_type)) .or_else(|| string_coercion(lhs_type, rhs_type)) + .or_else(|| list_coercion(lhs_type, rhs_type)) .or_else(|| null_coercion(lhs_type, rhs_type)) - .or_else(|| string_numeric_coercion_as_numeric(lhs_type, rhs_type)) + .or_else(|| string_numeric_coercion(lhs_type, rhs_type)) + .or_else(|| string_temporal_coercion(lhs_type, rhs_type)) + .or_else(|| binary_coercion(lhs_type, rhs_type)) + .or_else(|| struct_coercion(lhs_type, rhs_type, comparison_coercion)) + .or_else(|| map_coercion(lhs_type, rhs_type, comparison_coercion)) } -/// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of a comparison operation -/// where one is numeric and one is `Utf8`/`LargeUtf8`. +/// Coerce a numeric/string pair to the numeric type. +/// +/// Used by [`comparison_coercion`]. fn string_numeric_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { - (Utf8, _) if rhs_type.is_numeric() => Some(Utf8), - (LargeUtf8, _) if rhs_type.is_numeric() => Some(LargeUtf8), - (Utf8View, _) if rhs_type.is_numeric() => Some(Utf8View), - (_, Utf8) if lhs_type.is_numeric() => Some(Utf8), - (_, LargeUtf8) if lhs_type.is_numeric() => Some(LargeUtf8), - (_, Utf8View) if lhs_type.is_numeric() => Some(Utf8View), + (lhs, Utf8 | LargeUtf8 | Utf8View) if lhs.is_numeric() => Some(lhs.clone()), + (Utf8 | LargeUtf8 | Utf8View, rhs) if rhs.is_numeric() => Some(rhs.clone()), _ => None, } } -/// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of a comparison operation -/// where one is numeric and one is `Utf8`/`LargeUtf8`. -fn string_numeric_coercion_as_numeric( +/// Coerce a numeric/string pair to the string type. +/// +/// Used by [`type_union_coercion`]. +fn string_numeric_union_coercion( lhs_type: &DataType, rhs_type: &DataType, ) -> Option { - let lhs_logical_type = NativeType::from(lhs_type); - let rhs_logical_type = NativeType::from(rhs_type); - if lhs_logical_type.is_numeric() && rhs_logical_type == NativeType::String { - return Some(lhs_type.to_owned()); - } - if rhs_logical_type.is_numeric() && lhs_logical_type == NativeType::String { - return Some(rhs_type.to_owned()); + use arrow::datatypes::DataType::*; + match (lhs_type, rhs_type) { + (lhs @ (Utf8 | LargeUtf8 | Utf8View), _) if rhs_type.is_numeric() => { + Some(lhs.clone()) + } + (_, rhs @ (Utf8 | LargeUtf8 | Utf8View)) if lhs_type.is_numeric() => { + Some(rhs.clone()) + } + _ => None, } - - None } /// Coerce `lhs_type` and `rhs_type` to a common type for the purposes of a comparison operation @@ -1230,7 +1231,13 @@ fn coerce_numeric_type_to_decimal256(numeric_type: &DataType) -> Option Option { +/// Coerce two struct types by recursively coercing their fields using +/// `coerce_fn` (either [`comparison_coercion`] or [`type_union_coercion`]). +fn struct_coercion( + lhs_type: &DataType, + rhs_type: &DataType, + coerce_fn: fn(&DataType, &DataType) -> Option, +) -> Option { use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { @@ -1254,10 +1261,10 @@ fn struct_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option // // See docs/source/user-guide/sql/struct_coercion.md for detailed examples. if fields_have_same_names(lhs_fields, rhs_fields) { - return coerce_struct_by_name(lhs_fields, rhs_fields); + return coerce_struct_by_name(lhs_fields, rhs_fields, coerce_fn); } - coerce_struct_by_position(lhs_fields, rhs_fields) + coerce_struct_by_position(lhs_fields, rhs_fields, coerce_fn) } _ => None, } @@ -1300,8 +1307,13 @@ fn fields_have_same_names(lhs_fields: &Fields, rhs_fields: &Fields) -> bool { .all(|lf| rhs_names.contains(lf.name().as_str())) } -/// Coerce two structs by matching fields by name. Assumes the name-sets match. -fn coerce_struct_by_name(lhs_fields: &Fields, rhs_fields: &Fields) -> Option { +/// Coerce two structs by matching fields by name using `coerce_fn`. +/// Assumes the name-sets match. +fn coerce_struct_by_name( + lhs_fields: &Fields, + rhs_fields: &Fields, + coerce_fn: fn(&DataType, &DataType) -> Option, +) -> Option { use arrow::datatypes::DataType::*; let rhs_by_name: HashMap<&str, &FieldRef> = @@ -1311,7 +1323,7 @@ fn coerce_struct_by_name(lhs_fields: &Fields, rhs_fields: &Fields) -> Option Option Option, ) -> Option { use arrow::datatypes::DataType::*; @@ -1335,7 +1348,7 @@ fn coerce_struct_by_position( let coerced_types: Vec = lhs_fields .iter() .zip(rhs_fields.iter()) - .map(|(l, r)| comparison_coercion(l.data_type(), r.data_type())) + .map(|(l, r)| coerce_fn(l.data_type(), r.data_type())) .collect::>>()?; // Build final fields preserving left-side names and combined nullability. @@ -1356,13 +1369,17 @@ fn coerce_fields(common_type: DataType, lhs: &FieldRef, rhs: &FieldRef) -> Field Arc::new(Field::new(name, common_type, is_nullable)) } -/// coerce two types if they are Maps by coercing their inner 'entries' fields' types -/// using struct coercion -fn map_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { +/// Coerce two Map types by coercing their inner entry fields using +/// `coerce_fn` (either [`comparison_coercion`] or [`type_union_coercion`]). +fn map_coercion( + lhs_type: &DataType, + rhs_type: &DataType, + coerce_fn: fn(&DataType, &DataType) -> Option, +) -> Option { use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { (Map(lhs_field, lhs_ordered), Map(rhs_field, rhs_ordered)) => { - struct_coercion(lhs_field.data_type(), rhs_field.data_type()).map( + struct_coercion(lhs_field.data_type(), rhs_field.data_type(), coerce_fn).map( |key_value_type| { Map( Arc::new((**lhs_field).clone().with_data_type(key_value_type)), @@ -1483,15 +1500,12 @@ fn both_numeric_or_null_and_numeric(lhs_type: &DataType, rhs_type: &DataType) -> } } -/// Generic coercion rules for Dictionaries: the type that both lhs and rhs -/// can be casted to for the purpose of a computation. -/// -/// Not all operators support dictionaries, if `preserve_dictionaries` is true -/// dictionaries will be preserved if possible. +/// Coerce two Dictionary types by coercing their value types using +/// `coerce_fn` (either [`comparison_coercion`] or [`type_union_coercion`]). /// -/// The `coerce_fn` parameter determines which comparison coercion function to use -/// for comparing the dictionary value types. -fn dictionary_comparison_coercion_generic( +/// If `preserve_dictionaries` is true, dictionaries will be preserved +/// when possible. +fn dictionary_coercion( lhs_type: &DataType, rhs_type: &DataType, preserve_dictionaries: bool, @@ -1515,52 +1529,11 @@ fn dictionary_comparison_coercion_generic( } } -/// Coercion rules for Dictionaries: the type that both lhs and rhs -/// can be casted to for the purpose of a computation. +/// Coerce two RunEndEncoded types using `coerce_fn` +/// (either [`comparison_coercion`] or [`type_union_coercion`]). /// -/// Not all operators support dictionaries, if `preserve_dictionaries` is true -/// dictionaries will be preserved if possible -fn dictionary_comparison_coercion( - lhs_type: &DataType, - rhs_type: &DataType, - preserve_dictionaries: bool, -) -> Option { - dictionary_comparison_coercion_generic( - lhs_type, - rhs_type, - preserve_dictionaries, - comparison_coercion, - ) -} - -/// Coercion rules for Dictionaries with numeric preference: similar to -/// [`dictionary_comparison_coercion`] but uses [`comparison_coercion_numeric`] -/// which prefers numeric types over strings when both are present. -/// -/// This is used by [`comparison_coercion_numeric`] to maintain consistent -/// numeric-preferring semantics when dealing with dictionary types. -fn dictionary_comparison_coercion_numeric( - lhs_type: &DataType, - rhs_type: &DataType, - preserve_dictionaries: bool, -) -> Option { - dictionary_comparison_coercion_generic( - lhs_type, - rhs_type, - preserve_dictionaries, - comparison_coercion_numeric, - ) -} - -/// Coercion rules for RunEndEncoded: the type that both lhs and rhs -/// can be casted to for the purpose of a computation. -/// -/// Not all operators support REE, if `preserve_ree` is true -/// REE will be preserved if possible -/// -/// The `coerce_fn` parameter determines which comparison coercion function to use -/// for comparing the REE value types. -fn ree_comparison_coercion_generic( +/// If `preserve_ree` is true, REE will be preserved when possible. +fn ree_coercion( lhs_type: &DataType, rhs_type: &DataType, preserve_ree: bool, @@ -1587,38 +1560,6 @@ fn ree_comparison_coercion_generic( } } -/// Coercion rules for RunEndEncoded: the type that both lhs and rhs -/// can be casted to for the purpose of a computation. -/// -/// Not all operators support REE, if `preserve_ree` is true -/// REE will be preserved if possible -fn ree_comparison_coercion( - lhs_type: &DataType, - rhs_type: &DataType, - preserve_ree: bool, -) -> Option { - ree_comparison_coercion_generic(lhs_type, rhs_type, preserve_ree, comparison_coercion) -} - -/// Coercion rules for RunEndEncoded with numeric preference: similar to -/// [`ree_comparison_coercion`] but uses [`comparison_coercion_numeric`] -/// which prefers numeric types over strings when both are present. -/// -/// This is used by [`comparison_coercion_numeric`] to maintain consistent -/// numeric-preferring semantics when dealing with REE types. -fn ree_comparison_coercion_numeric( - lhs_type: &DataType, - rhs_type: &DataType, - preserve_ree: bool, -) -> Option { - ree_comparison_coercion_generic( - lhs_type, - rhs_type, - preserve_ree, - comparison_coercion_numeric, - ) -} - /// Coercion rules for string concat. /// This is a union of string coercion rules and specified rules: /// 1. At least one side of lhs and rhs should be string type (Utf8 / LargeUtf8) @@ -1683,20 +1624,8 @@ pub fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Option { - use arrow::datatypes::DataType::*; - match (lhs_type, rhs_type) { - (Utf8 | LargeUtf8 | Utf8View, other_type) - | (other_type, Utf8 | LargeUtf8 | Utf8View) - if other_type.is_numeric() => - { - Some(other_type.clone()) - } - _ => None, - } -} - -/// Coerces two fields together, ensuring the field data (name and nullability) is correctly set. +/// Coerce two list element fields to a common type via +/// [`type_union_resolution`]. fn coerce_list_children(lhs_field: &FieldRef, rhs_field: &FieldRef) -> Option { let data_types = vec![lhs_field.data_type().clone(), rhs_field.data_type().clone()]; Some(Arc::new( @@ -1707,7 +1636,8 @@ fn coerce_list_children(lhs_field: &FieldRef, rhs_field: &FieldRef) -> Option Option { use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { @@ -1803,8 +1733,8 @@ fn binary_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option pub fn like_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { string_coercion(lhs_type, rhs_type) .or_else(|| binary_to_string_coercion(lhs_type, rhs_type)) - .or_else(|| dictionary_comparison_coercion(lhs_type, rhs_type, false)) - .or_else(|| ree_comparison_coercion(lhs_type, rhs_type, false)) + .or_else(|| dictionary_coercion(lhs_type, rhs_type, false, string_coercion)) + .or_else(|| ree_coercion(lhs_type, rhs_type, false, string_coercion)) .or_else(|| regex_null_coercion(lhs_type, rhs_type)) .or_else(|| null_coercion(lhs_type, rhs_type)) } @@ -1821,10 +1751,11 @@ fn regex_null_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Option { string_coercion(lhs_type, rhs_type) - .or_else(|| dictionary_comparison_coercion(lhs_type, rhs_type, false)) + .or_else(|| dictionary_coercion(lhs_type, rhs_type, false, string_coercion)) + .or_else(|| ree_coercion(lhs_type, rhs_type, false, string_coercion)) .or_else(|| regex_null_coercion(lhs_type, rhs_type)) } diff --git a/datafusion/expr-common/src/type_coercion/binary/tests/comparison.rs b/datafusion/expr-common/src/type_coercion/binary/tests/comparison.rs index 5d1b3bea75b0a..166b308710e0e 100644 --- a/datafusion/expr-common/src/type_coercion/binary/tests/comparison.rs +++ b/datafusion/expr-common/src/type_coercion/binary/tests/comparison.rs @@ -791,3 +791,109 @@ fn test_decimal_cross_variant_comparison_coercion() -> Result<()> { Ok(()) } + +/// Tests that `comparison_coercion` prefers the numeric type when one side is +/// numeric and the other is a string (e.g., `numeric_col < '123'`). +#[test] +fn test_comparison_coercion_prefers_numeric() { + assert_eq!( + comparison_coercion(&DataType::Int32, &DataType::Utf8), + Some(DataType::Int32) + ); + assert_eq!( + comparison_coercion(&DataType::Utf8, &DataType::Int32), + Some(DataType::Int32) + ); + assert_eq!( + comparison_coercion(&DataType::Utf8, &DataType::Float64), + Some(DataType::Float64) + ); + assert_eq!( + comparison_coercion(&DataType::Float64, &DataType::Utf8), + Some(DataType::Float64) + ); + assert_eq!( + comparison_coercion(&DataType::Int64, &DataType::LargeUtf8), + Some(DataType::Int64) + ); + assert_eq!( + comparison_coercion(&DataType::Utf8View, &DataType::Int16), + Some(DataType::Int16) + ); + // String-string stays string + assert_eq!( + comparison_coercion(&DataType::Utf8, &DataType::Utf8), + Some(DataType::Utf8) + ); + // Numeric-numeric stays numeric + assert_eq!( + comparison_coercion(&DataType::Int32, &DataType::Int64), + Some(DataType::Int64) + ); +} + +/// Tests that `type_union_coercion` prefers the string type when unifying +/// numeric and string types (for UNION, CASE THEN/ELSE, etc.). +#[test] +fn test_type_union_coercion_prefers_string() { + assert_eq!( + type_union_coercion(&DataType::Int32, &DataType::Utf8), + Some(DataType::Utf8) + ); + assert_eq!( + type_union_coercion(&DataType::Utf8, &DataType::Int32), + Some(DataType::Utf8) + ); + assert_eq!( + type_union_coercion(&DataType::Float64, &DataType::Utf8), + Some(DataType::Utf8) + ); + assert_eq!( + type_union_coercion(&DataType::Utf8, &DataType::Float64), + Some(DataType::Utf8) + ); + assert_eq!( + type_union_coercion(&DataType::Int64, &DataType::LargeUtf8), + Some(DataType::LargeUtf8) + ); + assert_eq!( + type_union_coercion(&DataType::Utf8View, &DataType::Int16), + Some(DataType::Utf8View) + ); + // String-string stays string + assert_eq!( + type_union_coercion(&DataType::Utf8, &DataType::Utf8), + Some(DataType::Utf8) + ); + // Numeric-numeric stays numeric + assert_eq!( + type_union_coercion(&DataType::Int32, &DataType::Int64), + Some(DataType::Int64) + ); +} + +/// Tests that comparison operators coerce to numeric when comparing +/// numeric and string types. +#[test] +fn test_binary_comparison_string_numeric_coercion() -> Result<()> { + let comparison_ops = [ + Operator::Eq, + Operator::NotEq, + Operator::Lt, + Operator::LtEq, + Operator::Gt, + Operator::GtEq, + ]; + for op in &comparison_ops { + let (lhs, rhs) = BinaryTypeCoercer::new(&DataType::Int64, op, &DataType::Utf8) + .get_input_types()?; + assert_eq!(lhs, DataType::Int64, "Op {op}: Int64 vs Utf8 -> lhs"); + assert_eq!(rhs, DataType::Int64, "Op {op}: Int64 vs Utf8 -> rhs"); + + let (lhs, rhs) = BinaryTypeCoercer::new(&DataType::Utf8, op, &DataType::Float64) + .get_input_types()?; + assert_eq!(lhs, DataType::Float64, "Op {op}: Utf8 vs Float64 -> lhs"); + assert_eq!(rhs, DataType::Float64, "Op {op}: Utf8 vs Float64 -> rhs"); + } + Ok(()) +} diff --git a/datafusion/expr-common/src/type_coercion/binary/tests/dictionary.rs b/datafusion/expr-common/src/type_coercion/binary/tests/dictionary.rs index 0fb56a4a2c536..f0aadfd3ce3a5 100644 --- a/datafusion/expr-common/src/type_coercion/binary/tests/dictionary.rs +++ b/datafusion/expr-common/src/type_coercion/binary/tests/dictionary.rs @@ -24,49 +24,49 @@ fn test_dictionary_type_coercion() { let lhs_type = Dictionary(Box::new(Int8), Box::new(Int32)); let rhs_type = Dictionary(Box::new(Int8), Box::new(Int16)); assert_eq!( - dictionary_comparison_coercion(&lhs_type, &rhs_type, true), + dictionary_coercion(&lhs_type, &rhs_type, true, comparison_coercion), Some(Int32) ); assert_eq!( - dictionary_comparison_coercion(&lhs_type, &rhs_type, false), + dictionary_coercion(&lhs_type, &rhs_type, false, comparison_coercion), Some(Int32) ); - // Since we can coerce values of Int16 to Utf8 can support this + // In comparison context, numeric is preferred over string let lhs_type = Dictionary(Box::new(Int8), Box::new(Utf8)); let rhs_type = Dictionary(Box::new(Int8), Box::new(Int16)); assert_eq!( - dictionary_comparison_coercion(&lhs_type, &rhs_type, true), - Some(Utf8) + dictionary_coercion(&lhs_type, &rhs_type, true, comparison_coercion), + Some(Int16) ); // Since we can coerce values of Utf8 to Binary can support this let lhs_type = Dictionary(Box::new(Int8), Box::new(Utf8)); let rhs_type = Dictionary(Box::new(Int8), Box::new(Binary)); assert_eq!( - dictionary_comparison_coercion(&lhs_type, &rhs_type, true), + dictionary_coercion(&lhs_type, &rhs_type, true, comparison_coercion), Some(Binary) ); let lhs_type = Dictionary(Box::new(Int8), Box::new(Utf8)); let rhs_type = Utf8; assert_eq!( - dictionary_comparison_coercion(&lhs_type, &rhs_type, false), + dictionary_coercion(&lhs_type, &rhs_type, false, comparison_coercion), Some(Utf8) ); assert_eq!( - dictionary_comparison_coercion(&lhs_type, &rhs_type, true), + dictionary_coercion(&lhs_type, &rhs_type, true, comparison_coercion), Some(lhs_type.clone()) ); let lhs_type = Utf8; let rhs_type = Dictionary(Box::new(Int8), Box::new(Utf8)); assert_eq!( - dictionary_comparison_coercion(&lhs_type, &rhs_type, false), + dictionary_coercion(&lhs_type, &rhs_type, false, comparison_coercion), Some(Utf8) ); assert_eq!( - dictionary_comparison_coercion(&lhs_type, &rhs_type, true), + dictionary_coercion(&lhs_type, &rhs_type, true, comparison_coercion), Some(rhs_type.clone()) ); } diff --git a/datafusion/expr-common/src/type_coercion/binary/tests/run_end_encoded.rs b/datafusion/expr-common/src/type_coercion/binary/tests/run_end_encoded.rs index 9997db7a82688..dab42214d755c 100644 --- a/datafusion/expr-common/src/type_coercion/binary/tests/run_end_encoded.rs +++ b/datafusion/expr-common/src/type_coercion/binary/tests/run_end_encoded.rs @@ -30,15 +30,15 @@ fn test_ree_type_coercion() { Arc::new(Field::new("values", Int16, false)), ); assert_eq!( - ree_comparison_coercion(&lhs_type, &rhs_type, true), + ree_coercion(&lhs_type, &rhs_type, true, comparison_coercion), Some(Int32) ); assert_eq!( - ree_comparison_coercion(&lhs_type, &rhs_type, false), + ree_coercion(&lhs_type, &rhs_type, false, comparison_coercion), Some(Int32) ); - // Since we can coerce values of Int16 to Utf8 can support this: Coercion of Int16 to Utf8 + // In comparison context, numeric is preferred over string let lhs_type = RunEndEncoded( Arc::new(Field::new("run_ends", Int8, false)), Arc::new(Field::new("values", Utf8, false)), @@ -48,8 +48,8 @@ fn test_ree_type_coercion() { Arc::new(Field::new("values", Int16, false)), ); assert_eq!( - ree_comparison_coercion(&lhs_type, &rhs_type, true), - Some(Utf8) + ree_coercion(&lhs_type, &rhs_type, true, comparison_coercion), + Some(Int16) ); // Since we can coerce values of Utf8 to Binary can support this @@ -62,7 +62,7 @@ fn test_ree_type_coercion() { Arc::new(Field::new("values", Binary, false)), ); assert_eq!( - ree_comparison_coercion(&lhs_type, &rhs_type, true), + ree_coercion(&lhs_type, &rhs_type, true, comparison_coercion), Some(Binary) ); let lhs_type = RunEndEncoded( @@ -72,12 +72,12 @@ fn test_ree_type_coercion() { let rhs_type = Utf8; // Don't preserve REE assert_eq!( - ree_comparison_coercion(&lhs_type, &rhs_type, false), + ree_coercion(&lhs_type, &rhs_type, false, comparison_coercion), Some(Utf8) ); // Preserve REE assert_eq!( - ree_comparison_coercion(&lhs_type, &rhs_type, true), + ree_coercion(&lhs_type, &rhs_type, true, comparison_coercion), Some(lhs_type.clone()) ); @@ -88,12 +88,12 @@ fn test_ree_type_coercion() { ); // Don't preserve REE assert_eq!( - ree_comparison_coercion(&lhs_type, &rhs_type, false), + ree_coercion(&lhs_type, &rhs_type, false, comparison_coercion), Some(Utf8) ); // Preserve REE assert_eq!( - ree_comparison_coercion(&lhs_type, &rhs_type, true), + ree_coercion(&lhs_type, &rhs_type, true, comparison_coercion), Some(rhs_type.clone()) ); } diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index a79f2c66830bf..be097f3a8520f 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -33,7 +33,7 @@ use datafusion_expr_common::signature::ArrayFunctionArgument; use datafusion_expr_common::type_coercion::binary::type_union_resolution; use datafusion_expr_common::{ signature::{ArrayFunctionSignature, FIXED_SIZE_LIST_WILDCARD, TIMEZONE_WILDCARD}, - type_coercion::binary::comparison_coercion_numeric, + type_coercion::binary::comparison_coercion, type_coercion::binary::string_coercion, }; use itertools::Itertools as _; @@ -593,7 +593,7 @@ fn get_valid_types( function_length_check(function_name, current_types.len(), *num)?; let mut target_type = current_types[0].to_owned(); for data_type in current_types.iter().skip(1) { - if let Some(dt) = comparison_coercion_numeric(&target_type, data_type) { + if let Some(dt) = comparison_coercion(&target_type, data_type) { target_type = dt; } else { return plan_err!( diff --git a/datafusion/expr/src/type_coercion/other.rs b/datafusion/expr/src/type_coercion/other.rs index 634558094ae79..48125b661e2ca 100644 --- a/datafusion/expr/src/type_coercion/other.rs +++ b/datafusion/expr/src/type_coercion/other.rs @@ -17,38 +17,58 @@ use arrow::datatypes::DataType; -use super::binary::comparison_coercion; +use super::binary::{comparison_coercion, type_union_coercion}; + +/// Fold `coerce_fn` over `types`, starting from `initial_type`. +fn fold_coerce( + initial_type: &DataType, + types: &[DataType], + coerce_fn: fn(&DataType, &DataType) -> Option, +) -> Option { + types + .iter() + .try_fold(initial_type.clone(), |left_type, right_type| { + coerce_fn(&left_type, right_type) + }) +} /// Attempts to coerce the types of `list_types` to be comparable with the -/// `expr_type`. -/// Returns the common data type for `expr_type` and `list_types` +/// `expr_type` for IN list predicates. +/// Returns the common data type for `expr_type` and `list_types`. +/// +/// Uses comparison coercion because `x IN (a, b)` is semantically equivalent +/// to `x = a OR x = b`. pub fn get_coerce_type_for_list( expr_type: &DataType, list_types: &[DataType], ) -> Option { - list_types - .iter() - .try_fold(expr_type.clone(), |left_type, right_type| { - comparison_coercion(&left_type, right_type) - }) + fold_coerce(expr_type, list_types, comparison_coercion) +} + +/// Find a common coerceable type for `CASE expr WHEN val1 WHEN val2 ...` +/// conditions. Returns the common type for `case_type` and all `when_types`. +/// +/// Uses comparison coercion because `CASE expr WHEN val` is semantically +/// equivalent to `expr = val`. +pub fn get_coerce_type_for_case_when( + when_types: &[DataType], + case_type: &DataType, +) -> Option { + fold_coerce(case_type, when_types, comparison_coercion) } -/// Find a common coerceable type for all `when_or_then_types` as well -/// and the `case_or_else_type`, if specified. -/// Returns the common data type for `when_or_then_types` and `case_or_else_type` +/// Find a common coerceable type for CASE THEN/ELSE result expressions. +/// Returns the common data type for `then_types` and `else_type`. +/// +/// Uses type union coercion because the result branches must be brought to a +/// common type (like UNION), not compared. pub fn get_coerce_type_for_case_expression( - when_or_then_types: &[DataType], - case_or_else_type: Option<&DataType>, + then_types: &[DataType], + else_type: Option<&DataType>, ) -> Option { - let case_or_else_type = match case_or_else_type { - None => when_or_then_types[0].clone(), - Some(data_type) => data_type.clone(), + let (initial_type, remaining) = match else_type { + None => then_types.split_first()?, + Some(data_type) => (data_type, then_types), }; - when_or_then_types - .iter() - .try_fold(case_or_else_type, |left_type, right_type| { - // TODO: now just use the `equal` coercion rule for case when. If find the issue, and - // refactor again. - comparison_coercion(&left_type, right_type) - }) + fold_coerce(initial_type, remaining, type_union_coercion) } diff --git a/datafusion/functions/src/core/nvl2.rs b/datafusion/functions/src/core/nvl2.rs index 0b092c44d502b..6be861b129122 100644 --- a/datafusion/functions/src/core/nvl2.rs +++ b/datafusion/functions/src/core/nvl2.rs @@ -22,7 +22,7 @@ use datafusion_expr::{ ScalarUDFImpl, Signature, Volatility, conditional_expressions::CaseBuilder, simplify::{ExprSimplifyResult, SimplifyContext}, - type_coercion::binary::comparison_coercion, + type_coercion::binary::type_union_coercion, }; use datafusion_macros::user_doc; @@ -133,11 +133,9 @@ impl ScalarUDFImpl for NVL2Func { [if_non_null, if_null] .iter() .try_fold(tested.clone(), |acc, x| { - // The coerced types found by `comparison_coercion` are not guaranteed to be - // coercible for the arguments. `comparison_coercion` returns more loose - // types that can be coerced to both `acc` and `x` for comparison purpose. - // See `maybe_data_types` for the actual coercion. - let coerced_type = comparison_coercion(&acc, x); + // `type_union_coercion` finds a loose common type; the actual + // coercion is done by `maybe_data_types`. + let coerced_type = type_union_coercion(&acc, x); if let Some(coerced_type) = coerced_type { Ok(coerced_type) } else { diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index efc9984acb9b0..baffa23258b60 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -41,11 +41,14 @@ use datafusion_expr::expr::{ use datafusion_expr::expr_rewriter::coerce_plan_expr_for_schema; use datafusion_expr::expr_schema::cast_subquery; use datafusion_expr::logical_plan::Subquery; -use datafusion_expr::type_coercion::binary::{comparison_coercion, like_coercion}; +use datafusion_expr::type_coercion::binary::{ + comparison_coercion, like_coercion, type_union_coercion, +}; use datafusion_expr::type_coercion::functions::{UDFCoercionExt, fields_with_udf}; use datafusion_expr::type_coercion::is_datetime; use datafusion_expr::type_coercion::other::{ - get_coerce_type_for_case_expression, get_coerce_type_for_list, + get_coerce_type_for_case_expression, get_coerce_type_for_case_when, + get_coerce_type_for_list, }; use datafusion_expr::utils::merge_schema; use datafusion_expr::{ @@ -1028,8 +1031,7 @@ fn coerce_case_expression(case: Case, schema: &DFSchema) -> Result { .iter() .map(|(when, _then)| when.get_type(schema)) .collect::>>()?; - let coerced_type = - get_coerce_type_for_case_expression(&when_types, Some(case_type)); + let coerced_type = get_coerce_type_for_case_when(&when_types, case_type); coerced_type.ok_or_else(|| { plan_datafusion_err!( "Failed to coerce case ({case_type}) and when ({}) \ @@ -1107,7 +1109,7 @@ fn coerce_case_expression(case: Case, schema: &DFSchema) -> Result { /// **Field-level metadata merging**: Later fields take precedence for duplicate metadata keys. /// /// **Type coercion precedence**: The coerced type is determined by iteratively applying -/// `comparison_coercion()` between the accumulated type and each new input's type. The +/// `type_union_coercion()` between the accumulated type and each new input's type. The /// result depends on type coercion rules, not input order. /// /// **Nullability merging**: Nullability is accumulated using logical OR (`||`). @@ -1130,7 +1132,7 @@ fn coerce_case_expression(case: Case, schema: &DFSchema) -> Result { /// ``` /// /// **Precedence Summary**: -/// - **Datatypes**: Determined by `comparison_coercion()` rules, not input order +/// - **Datatypes**: Determined by `type_union_coercion()` rules, not input order /// - **Nullability**: Later inputs can add nullability but cannot remove it (logical OR) /// - **Metadata**: Later inputs take precedence for same keys (HashMap::extend semantics) pub fn coerce_union_schema(inputs: &[Arc]) -> Result { @@ -1180,7 +1182,7 @@ fn coerce_union_schema_with_schema( plan_schema.fields().iter() ) { let coerced_type = - comparison_coercion(union_datatype, plan_field.data_type()).ok_or_else( + type_union_coercion(union_datatype, plan_field.data_type()).ok_or_else( || { plan_datafusion_err!( "Incompatible inputs for Union: Previous inputs were \ @@ -2345,6 +2347,9 @@ mod test { let actual = coerce_case_expression(case, &schema)?; assert_eq!(expected, actual); + // CASE string WHEN float/integer/string: comparison coercion + // prefers numeric, so the common type for the CASE expr and + // WHEN values is Float32. let case = Case { expr: Some(Box::new(col("string"))), when_then_expr: vec![ @@ -2354,7 +2359,7 @@ mod test { ], else_expr: Some(Box::new(col("string"))), }; - let case_when_common_type = Utf8; + let case_when_common_type = DataType::Float32; let then_else_common_type = Utf8; let expected = cast_helper( case.clone(), diff --git a/datafusion/physical-expr/src/expressions/case.rs b/datafusion/physical-expr/src/expressions/case.rs index f1d867dddf369..bbb1b3ec25d79 100644 --- a/datafusion/physical-expr/src/expressions/case.rs +++ b/datafusion/physical-expr/src/expressions/case.rs @@ -1431,7 +1431,7 @@ mod tests { use datafusion_common::cast::{as_float64_array, as_int32_array}; use datafusion_common::plan_err; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; - use datafusion_expr::type_coercion::binary::comparison_coercion; + use datafusion_expr::type_coercion::binary::type_union_coercion; use datafusion_expr_common::operator::Operator; use datafusion_physical_expr_common::physical_expr::fmt_sql; use half::f16; @@ -2381,9 +2381,7 @@ mod tests { thens_type .iter() .try_fold(else_type, |left_type, right_type| { - // TODO: now just use the `equal` coercion rule for case when. If find the issue, and - // refactor again. - comparison_coercion(&left_type, right_type) + type_union_coercion(&left_type, right_type) }) } diff --git a/datafusion/sqllogictest/test_files/delete.slt b/datafusion/sqllogictest/test_files/delete.slt index b01eb6f5e9ec7..be6d5739e1f0f 100644 --- a/datafusion/sqllogictest/test_files/delete.slt +++ b/datafusion/sqllogictest/test_files/delete.slt @@ -45,7 +45,7 @@ explain delete from t1 where a = 1 and b = 2 and c > 3 and d != 4; ---- logical_plan 01)Dml: op=[Delete] table=[t1] -02)--Filter: CAST(t1.a AS Int64) = Int64(1) AND t1.b = CAST(Int64(2) AS Utf8View) AND t1.c > CAST(Int64(3) AS Float64) AND CAST(t1.d AS Int64) != Int64(4) +02)--Filter: CAST(t1.a AS Int64) = Int64(1) AND CAST(t1.b AS Int64) = Int64(2) AND t1.c > CAST(Int64(3) AS Float64) AND CAST(t1.d AS Int64) != Int64(4) 03)----TableScan: t1 physical_plan 01)CooperativeExec @@ -58,7 +58,7 @@ explain delete from t1 where t1.a = 1 and b = 2 and t1.c > 3 and d != 4; ---- logical_plan 01)Dml: op=[Delete] table=[t1] -02)--Filter: CAST(t1.a AS Int64) = Int64(1) AND t1.b = CAST(Int64(2) AS Utf8View) AND t1.c > CAST(Int64(3) AS Float64) AND CAST(t1.d AS Int64) != Int64(4) +02)--Filter: CAST(t1.a AS Int64) = Int64(1) AND CAST(t1.b AS Int64) = Int64(2) AND t1.c > CAST(Int64(3) AS Float64) AND CAST(t1.d AS Int64) != Int64(4) 03)----TableScan: t1 physical_plan 01)CooperativeExec diff --git a/datafusion/sqllogictest/test_files/dictionary.slt b/datafusion/sqllogictest/test_files/dictionary.slt index 511061cf82f06..5ec95260e8357 100644 --- a/datafusion/sqllogictest/test_files/dictionary.slt +++ b/datafusion/sqllogictest/test_files/dictionary.slt @@ -426,7 +426,8 @@ physical_plan 02)--DataSourceExec: partitions=1, partition_sizes=[1] -# Now query using an integer which must be coerced into a dictionary string +# Query using an integer literal: comparison coercion prefers numeric types, +# so the dictionary string column is cast to Int64 query TT SELECT * from test where column2 = 1; ---- @@ -436,10 +437,10 @@ query TT explain SELECT * from test where column2 = 1; ---- logical_plan -01)Filter: test.column2 = Dictionary(Int32, Utf8("1")) +01)Filter: CAST(test.column2 AS Int64) = Int64(1) 02)--TableScan: test projection=[column1, column2] physical_plan -01)FilterExec: column2@1 = 1 +01)FilterExec: CAST(column2@1 AS Int64) = 1 02)--DataSourceExec: partitions=1, partition_sizes=[1] # Window Functions diff --git a/datafusion/sqllogictest/test_files/expr.slt b/datafusion/sqllogictest/test_files/expr.slt index a6341bc686f74..9365f3896b618 100644 --- a/datafusion/sqllogictest/test_files/expr.slt +++ b/datafusion/sqllogictest/test_files/expr.slt @@ -1086,55 +1086,37 @@ SELECT 0.3 NOT IN (0.0,0.1,0.2,NULL) ---- NULL -query B +# Mixed string/integer IN lists: comparison coercion picks the numeric +# type, so non-numeric strings like 'a' fail to cast to Int64. +query error Cannot cast string 'a' to value of Int64 type SELECT '1' IN ('a','b',1) ----- -true -query B +query error Cannot cast string 'a' to value of Int64 type SELECT '2' IN ('a','b',1) ----- -false -query B +query error Cannot cast string 'a' to value of Int64 type SELECT '2' NOT IN ('a','b',1) ----- -true -query B +query error Cannot cast string 'a' to value of Int64 type SELECT '1' NOT IN ('a','b',1) ----- -false -query B +query error Cannot cast string 'a' to value of Int64 type SELECT NULL IN ('a','b',1) ----- -NULL -query B +query error Cannot cast string 'a' to value of Int64 type SELECT NULL NOT IN ('a','b',1) ----- -NULL -query B +query error Cannot cast string 'a' to value of Int64 type SELECT '1' IN ('a','b',NULL,1) ----- -true -query B +query error Cannot cast string 'a' to value of Int64 type SELECT '2' IN ('a','b',NULL,1) ----- -NULL -query B +query error Cannot cast string 'a' to value of Int64 type SELECT '1' NOT IN ('a','b',NULL,1) ----- -false -query B +query error Cannot cast string 'a' to value of Int64 type SELECT '2' NOT IN ('a','b',NULL,1) ----- -NULL # ======================================================================== # Comprehensive IN LIST tests with NULL handling diff --git a/datafusion/sqllogictest/test_files/push_down_filter_parquet.slt b/datafusion/sqllogictest/test_files/push_down_filter_parquet.slt index e1c83c8c330d8..d612b01b5427b 100644 --- a/datafusion/sqllogictest/test_files/push_down_filter_parquet.slt +++ b/datafusion/sqllogictest/test_files/push_down_filter_parquet.slt @@ -122,24 +122,6 @@ explain select a from t where a != '100'; ---- physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/t.parquet]]}, projection=[a], file_type=parquet, predicate=a@0 != 100, pruning_predicate=a_null_count@2 != row_count@3 AND (a_min@0 != 100 OR 100 != a_max@1), required_guarantees=[a not in (100)] -# The predicate should still have the column cast when the value is a NOT valid i32 -query TT -explain select a from t where a = '99999999999'; ----- -physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = 99999999999 - -# The predicate should still have the column cast when the value is a NOT valid i32 -query TT -explain select a from t where a = '99.99'; ----- -physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = 99.99 - -# The predicate should still have the column cast when the value is a NOT valid i32 -query TT -explain select a from t where a = ''; ----- -physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter_parquet/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = - # The predicate should not have a column cast when the operator is = or != and the literal can be round-trip casted without losing information. query TT explain select a from t where cast(a as string) = '100'; diff --git a/datafusion/sqllogictest/test_files/string/string_query.slt.part b/datafusion/sqllogictest/test_files/string/string_query.slt.part index 2884c3518610d..679ba0aa8a888 100644 --- a/datafusion/sqllogictest/test_files/string/string_query.slt.part +++ b/datafusion/sqllogictest/test_files/string/string_query.slt.part @@ -41,38 +41,17 @@ NULL R NULL 🔥 # -------------------------------------- # test type coercion (compare to int) -# queries should not error +# +# Comparing a string column to an integer literal is allowed but will fail +# at runtime if the string column contains any values that can't be cast +# to integers. # -------------------------------------- -query BB +statement error Arrow error: Cast error: Cannot cast string 'Andrew' to value of Int64 type select ascii_1 = 1 as col1, 1 = ascii_1 as col2 from test_basic_operator; ----- -false false -false false -false false -false false -false false -false false -false false -false false -false false -NULL NULL -NULL NULL -query BB +statement error Arrow error: Cast error: Cannot cast string 'Andrew' to value of Int64 type select ascii_1 <> 1 as col1, 1 <> ascii_1 as col2 from test_basic_operator; ----- -true true -true true -true true -true true -true true -true true -true true -true true -true true -NULL NULL -NULL NULL # Coercion to date/time query BBB diff --git a/datafusion/sqllogictest/test_files/string_numeric_coercion.slt b/datafusion/sqllogictest/test_files/string_numeric_coercion.slt new file mode 100644 index 0000000000000..27b1be1074769 --- /dev/null +++ b/datafusion/sqllogictest/test_files/string_numeric_coercion.slt @@ -0,0 +1,496 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +########## +## Tests for string-numeric comparison coercion +## Verifies that when comparing a numeric column to a string literal, +## the comparison is performed numerically (not lexicographically). +## See: https://github.com/apache/datafusion/issues/15161 +########## + +# Setup test data +statement ok +CREATE TABLE t_int AS VALUES (1), (5), (325), (499), (1000); + +statement ok +CREATE TABLE t_float AS VALUES (1.5), (5.0), (325.7), (499.9), (1000.1); + +# ------------------------------------------------- +# Integer column with comparison operators vs string literals. +# Ensure that the comparison is done with numeric semantics, +# not lexicographically. +# ------------------------------------------------- + +query I rowsort +SELECT * FROM t_int WHERE column1 < '5'; +---- +1 + +query I rowsort +SELECT * FROM t_int WHERE column1 > '5'; +---- +1000 +325 +499 + +query I rowsort +SELECT * FROM t_int WHERE column1 <= '5'; +---- +1 +5 + +query I rowsort +SELECT * FROM t_int WHERE column1 >= '5'; +---- +1000 +325 +499 +5 + +query I rowsort +SELECT * FROM t_int WHERE column1 = '5'; +---- +5 + +query I rowsort +SELECT * FROM t_int WHERE column1 != '5'; +---- +1 +1000 +325 +499 + +query I rowsort +SELECT * FROM t_int WHERE column1 < '10'; +---- +1 +5 + +query I rowsort +SELECT * FROM t_int WHERE column1 <= '100'; +---- +1 +5 + +query I rowsort +SELECT * FROM t_int WHERE column1 > '100'; +---- +1000 +325 +499 + +# ------------------------------------------------- +# Float column with comparison operators vs string literals +# ------------------------------------------------- + +query R rowsort +SELECT * FROM t_float WHERE column1 < '5'; +---- +1.5 + +query R rowsort +SELECT * FROM t_float WHERE column1 > '5'; +---- +1000.1 +325.7 +499.9 + +query R rowsort +SELECT * FROM t_float WHERE column1 = '5'; +---- +5 + +query R rowsort +SELECT * FROM t_float WHERE column1 = '5.0'; +---- +5 + +# ------------------------------------------------- +# Error on strings that cannot be cast to the numeric column type +# ------------------------------------------------- + +# Non-numeric string against integer column +statement error Arrow error: Cast error: Cannot cast string 'hello' to value of Int64 type +SELECT * FROM t_int WHERE column1 < 'hello'; + +# Non-numeric string against float column +statement error Arrow error: Cast error: Cannot cast string 'hello' to value of Float64 type +SELECT * FROM t_float WHERE column1 < 'hello'; + +# Float string against integer column +statement error Arrow error: Cast error: Cannot cast string '99.99' to value of Int64 type +SELECT * FROM t_int WHERE column1 = '99.99'; + +# Empty string against integer column +statement error Arrow error: Cast error: Cannot cast string '' to value of Int64 type +SELECT * FROM t_int WHERE column1 = ''; + +# Empty string against float column +statement error Arrow error: Cast error: Cannot cast string '' to value of Float64 type +SELECT * FROM t_float WHERE column1 = ''; + +# Overflow +statement error Arrow error: Cast error: Cannot cast string '99999999999999999999' to value of Int64 type +SELECT * FROM t_int WHERE column1 = '99999999999999999999'; + + +# ------------------------------------------------- +# UNION still uses string coercion (type unification context) +# ------------------------------------------------- + +statement ok +CREATE TABLE t_str AS VALUES ('one'), ('two'), ('three'); + +query T rowsort +SELECT column1 FROM t_int UNION ALL SELECT column1 FROM t_str; +---- +1 +1000 +325 +499 +5 +one +three +two + +# Verify the UNION coerces to Utf8 (not numeric) +query TT +EXPLAIN SELECT column1 FROM t_int UNION ALL SELECT column1 FROM t_str; +---- +logical_plan +01)Union +02)--Projection: CAST(t_int.column1 AS Utf8) AS column1 +03)----TableScan: t_int projection=[column1] +04)--TableScan: t_str projection=[column1] +physical_plan +01)UnionExec +02)--ProjectionExec: expr=[CAST(column1@0 AS Utf8) as column1] +03)----DataSourceExec: partitions=1, partition_sizes=[1] +04)--DataSourceExec: partitions=1, partition_sizes=[1] + +# ------------------------------------------------- +# BETWEEN uses comparison coercion (numeric preferred) +# ------------------------------------------------- + +query I rowsort +SELECT * FROM t_int WHERE column1 BETWEEN '5' AND '100'; +---- +5 + +# ------------------------------------------------- +# IN list uses comparison coercion (numeric preferred) +# `x IN (a, b)` is semantically equivalent to `x = a OR x = b` +# ------------------------------------------------- + +# Basic IN list with string literals against integer column +query I rowsort +SELECT * FROM t_int WHERE column1 IN ('5', '325'); +---- +325 +5 + +# IN list with a value where numeric coercion matters +query I rowsort +SELECT * FROM t_int WHERE column1 IN ('1000'); +---- +1000 + +# IN list with NOT +query I rowsort +SELECT * FROM t_int WHERE column1 NOT IN ('1', '5'); +---- +1000 +325 +499 + +# IN list with float column +query R rowsort +SELECT * FROM t_float WHERE column1 IN ('5.0', '325.7'); +---- +325.7 +5 + +# Verify the plan shows numeric coercion (not CAST to Utf8) +query TT +EXPLAIN SELECT * FROM t_int WHERE column1 IN ('5', '325'); +---- +logical_plan +01)Filter: t_int.column1 = Int64(5) OR t_int.column1 = Int64(325) +02)--TableScan: t_int projection=[column1] +physical_plan +01)FilterExec: column1@0 = 5 OR column1@0 = 325 +02)--DataSourceExec: partitions=1, partition_sizes=[1] + +# Error on invalid string in IN list +statement error Arrow error: Cast error: Cannot cast string 'hello' to value of Int64 type +SELECT * FROM t_int WHERE column1 IN ('5', 'hello'); + +# Mixed numeric literal and string literal in IN list against integer column +query I rowsort +SELECT * FROM t_int WHERE column1 IN (5, '325'); +---- +325 +5 + +# Mixed numeric literal and string literal in IN list against float column +query R rowsort +SELECT * FROM t_float WHERE column1 IN (5, '325.7'); +---- +325.7 +5 + +# String and numeric literal order reversed +query R rowsort +SELECT * FROM t_float WHERE column1 IN ('5', 325.7); +---- +325.7 +5 + +# Float string literal against integer column errors (cannot cast '10.0' to Int64) +statement error Arrow error: Cast error: Cannot cast string '10.0' to value of Int64 type +SELECT * FROM t_int WHERE column1 IN (5, '10.0'); + +# Non-numeric string in mixed IN list still errors +statement error Arrow error: Cast error: Cannot cast string 'hello' to value of Int64 type +SELECT * FROM t_int WHERE column1 IN ('hello', 5); + +# ------------------------------------------------- +# CASE WHEN uses comparison coercion for conditions (numeric preferred) +# `CASE expr WHEN val` is semantically equivalent to `expr = val` +# ------------------------------------------------- + +# Basic CASE with integer column and string WHEN values +query T rowsort +SELECT CASE column1 WHEN '5' THEN 'five' WHEN '1000' THEN 'thousand' ELSE 'other' END FROM t_int; +---- +five +other +other +other +thousand + +# CASE with float column: '5' is cast to 5.0 numerically, matching the row. +# (Under string comparison, '5.0' != '5' would fail to match.) +query T rowsort +SELECT CASE column1 WHEN '5' THEN 'matched' ELSE 'no match' END FROM t_float; +---- +matched +no match +no match +no match +no match + +# THEN/ELSE results still use type union coercion (string preferred), +# so mixing numeric and string coerces to string +query T rowsort +SELECT CASE WHEN column1 > 500 THEN column1 ELSE 'small' END FROM t_int; +---- +1000 +small +small +small +small + +# ------------------------------------------------- +# Nested struct/map/list comparisons use comparison coercion +# (numeric preferred) for their field/element types +# ------------------------------------------------- + +statement ok +CREATE TABLE t_struct_int AS SELECT named_struct('val', column1) as s FROM (VALUES (1), (5), (10)); + +statement ok +CREATE TABLE t_struct_str AS SELECT named_struct('val', column1) as s FROM (VALUES ('5'), ('10')); + +# Struct comparison: the string field is cast to Int64 (numeric preferred). +query ? rowsort +SELECT t1.s FROM t_struct_int t1, t_struct_str t2 WHERE t1.s = t2.s; +---- +{val: 10} +{val: 5} + +# Struct in UNION uses type union coercion (string preferred). +# The integer struct field is cast to Utf8. +query ? rowsort +SELECT s FROM t_struct_int UNION ALL SELECT s FROM t_struct_str; +---- +{val: 10} +{val: 10} +{val: 1} +{val: 5} +{val: 5} + +statement ok +DROP TABLE t_struct_int; + +statement ok +DROP TABLE t_struct_str; + +# List comparison: string elements are cast to Int64 (numeric preferred). +statement ok +CREATE TABLE t_list_int AS SELECT column1 as l FROM (VALUES ([1, 5, 10]), ([20, 30])); + +statement ok +CREATE TABLE t_list_str AS SELECT column1 as l FROM (VALUES (['5', '10']), (['20', '30'])); + +# Verify the element types are Int64 and Utf8 respectively +query T +SELECT arrow_typeof(l) FROM t_list_int LIMIT 1; +---- +List(Int64) + +query T +SELECT arrow_typeof(l) FROM t_list_str LIMIT 1; +---- +List(Utf8) + +query ? rowsort +SELECT t1.l FROM t_list_int t1, t_list_str t2 WHERE t1.l = t2.l; +---- +[20, 30] + +# List in UNION uses type union coercion (string preferred). +# The integer list elements are cast to Utf8. +query ? rowsort +SELECT l FROM t_list_int UNION ALL SELECT l FROM t_list_str; +---- +[1, 5, 10] +[20, 30] +[20, 30] +[5, 10] + +statement ok +DROP TABLE t_list_int; + +statement ok +DROP TABLE t_list_str; + +# Map comparison: string values are cast to Int64 (numeric preferred). +statement ok +CREATE TABLE t_map_int AS SELECT MAP {'a': 1, 'b': 5} as m; + +statement ok +CREATE TABLE t_map_str AS SELECT MAP {'a': '1', 'b': '5'} as m; + +# Verify the value types are Int64 and Utf8 respectively +query T +SELECT arrow_typeof(m) FROM t_map_int LIMIT 1; +---- +Map("entries": non-null Struct("key": non-null Utf8, "value": Int64), unsorted) + +query T +SELECT arrow_typeof(m) FROM t_map_str LIMIT 1; +---- +Map("entries": non-null Struct("key": non-null Utf8, "value": Utf8), unsorted) + +query ? rowsort +SELECT t1.m FROM t_map_int t1, t_map_str t2 WHERE t1.m = t2.m; +---- +{a: 1, b: 5} + +# Map in UNION uses type union coercion (string preferred). +# The integer map values are cast to Utf8. +query ? rowsort +SELECT m FROM t_map_int UNION ALL SELECT m FROM t_map_str; +---- +{a: 1, b: 5} +{a: 1, b: 5} + +statement ok +DROP TABLE t_map_int; + +statement ok +DROP TABLE t_map_str; + +# ------------------------------------------------- +# LIKE / regex on dictionary-encoded numeric columns should error, +# consistent with LIKE on plain numeric columns +# ------------------------------------------------- + +# Plain integer column: LIKE is not supported +statement error There isn't a common type to coerce Int64 and Utf8 in LIKE expression +SELECT * FROM t_int WHERE column1 LIKE '%5%'; + +# Dictionary-encoded integer column: should also error +statement error There isn't a common type to coerce Dictionary\(Int32, Int64\) and Utf8 in LIKE expression +SELECT arrow_cast(column1, 'Dictionary(Int32, Int64)') LIKE '%5%' FROM t_int; + +# Dictionary-encoded string column: LIKE works as normal +query B rowsort +SELECT arrow_cast('hello', 'Dictionary(Int32, Utf8)') LIKE '%ell%'; +---- +true + +# REE-encoded integer column: LIKE should also error +statement error There isn't a common type to coerce RunEndEncoded.* and Utf8 in LIKE expression +SELECT arrow_cast(column1, 'RunEndEncoded("run_ends": non-null Int32, "values": Int64)') LIKE '%5%' FROM t_int; + +# REE-encoded string column: LIKE works as normal +query B rowsort +SELECT arrow_cast('hello', 'RunEndEncoded("run_ends": non-null Int32, "values": Utf8)') LIKE '%ell%'; +---- +true + +# Dictionary-encoded integer column: regex should error +statement error Cannot infer common argument type for regex operation +SELECT arrow_cast(column1, 'Dictionary(Int32, Int64)') ~ '5' FROM t_int; + +# Dictionary-encoded string column: regex works as normal +query B rowsort +SELECT arrow_cast('hello', 'Dictionary(Int32, Utf8)') ~ 'ell'; +---- +true + +# REE-encoded integer column: regex should error +statement error Cannot infer common argument type for regex operation +SELECT arrow_cast(column1, 'RunEndEncoded("run_ends": non-null Int32, "values": Int64)') ~ '5' FROM t_int; + +# REE-encoded string column: regex works as normal +query B rowsort +SELECT arrow_cast('hello', 'RunEndEncoded("run_ends": non-null Int32, "values": Utf8)') ~ 'ell'; +---- +true + +# ------------------------------------------------- +# Cleanup +# ------------------------------------------------- + +statement ok +DROP TABLE t_int; + +statement ok +DROP TABLE t_float; + +statement ok +DROP TABLE t_str; + +# ------------------------------------------------- +# List element coercion should reject mixed +# numeric/string categories (same as array literals) +# ------------------------------------------------- + +# Array literal with mixed numeric/string elements errors +query error Cannot cast string 'a' to value of Int64 type +SELECT [1, 'a']; + +# MAP with mixed-category list keys should also error +query error +SELECT MAP {[1,2,3]:1, ['a', 'b']:2}; + +# MAP with mixed-category list values should also error +query error +SELECT MAP {'a':[1,2,3], 'b':['a', 'b']}; diff --git a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs index 926eb8a343f01..18b6f8560d552 100644 --- a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs @@ -568,7 +568,7 @@ async fn try_cast_decimal_to_int() -> Result<()> { #[tokio::test] async fn try_cast_decimal_to_string() -> Result<()> { - roundtrip("SELECT * FROM data WHERE a = TRY_CAST(b AS string)").await + roundtrip("SELECT * FROM data WHERE f = TRY_CAST(b AS string)").await } #[tokio::test] diff --git a/docs/source/library-user-guide/upgrading/53.0.0.md b/docs/source/library-user-guide/upgrading/53.0.0.md index ef5f5743f5ea6..8d13ba97a4c64 100644 --- a/docs/source/library-user-guide/upgrading/53.0.0.md +++ b/docs/source/library-user-guide/upgrading/53.0.0.md @@ -440,6 +440,67 @@ let sort_proto = serialize_physical_sort_expr( ); ``` +### String/numeric comparison coercion now prefers numeric types + +Previously, comparing a numeric column with a string value (e.g., +`WHERE int_col > '100'`) coerced both sides to strings and performed a +lexicographic comparison. This produced incorrect results — for example, +`5 > '100'` was `true` under string comparison because `'5' > '1'` +lexicographically, even though `5 > 100` is `false` numerically. + +DataFusion now coerces the string side to the numeric type in comparison +contexts (`=`, `<`, `>`, `<=`, `>=`, `<>`, `IN`, `BETWEEN`, `CASE .. WHEN`). + +**Who is affected:** + +- Queries that compare numeric values with string values +- Queries that use `IN` lists with mixed string and numeric types +- Queries that use `CASE expr WHEN` with mixed string and numeric types + +**Behavioral changes:** + +| Expression | Old behavior | New behavior | +| ----------------------- | ------------------------------- | ------------------------------------------ | +| `int_col > '100'` | Lexicographic | Numeric | +| `float_col = '5'` | String `'5' != '5.0'` | Numeric `5.0 = 5.0` | +| `int_col = 'hello'` | String comparison, always false | Cast error | +| `str_col IN ('a', 1)` | Coerce to Utf8 | Cast error (`'a'` cannot be cast to Int64) | +| `float_col IN ('1.0')` | String `'1.0' != '1'` | Numeric `1.0 = 1.0` (correct) | +| `CASE str_col WHEN 1.0` | Coerce to Utf8 | Coerce to Float64 | + +**Migration guide:** + +Most queries will produce more correct results with no changes needed. +However, queries that relied on the old string-comparison behavior may need +adjustment: + +- **Queries comparing numeric columns with non-numeric strings** (e.g., + `int_col = 'hello'` or `int_col > text_col` where `text_col` contains + non-numeric values) will now produce a cast error instead of silently + returning no rows. +- **Mixed-type `IN` lists** (e.g., `str_col IN ('a', 1)`) are now rejected. Use + consistent types for the `IN` list or add an explicit `CAST`. +- **Queries comparing integer columns with decimal strings** (e.g., + `int_col = '99.99'`) will now produce a cast error because `'99.99'` + cannot be cast to an integer. Use a float column or adjust the literal. + +See [#15161](https://github.com/apache/datafusion/issues/15161) and +[PR #20426](https://github.com/apache/datafusion/pull/20426) for details. + +### `comparison_coercion_numeric` removed, replaced by `comparison_coercion` + +The `comparison_coercion_numeric` function has been removed. Its behavior +(preferring numeric types for string/numeric comparisons) is now the default in +`comparison_coercion`. A new function, `type_union_coercion`, handles contexts +where string types are preferred (`UNION`, `CASE THEN/ELSE`, `NVL2`). + +**Who is affected:** + +- Crates that call `comparison_coercion_numeric` directly +- Crates that call `comparison_coercion` and relied on its old + string-preferring behavior +- Crates that call `get_coerce_type_for_case_expression` + ### `generate_series` and `range` table functions changed The `generate_series` and `range` table functions now return an empty set when the interval is invalid, instead of an error.