diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 017a123eb035b..6a390067bea27 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -272,8 +272,10 @@ impl LogicalPlanBuilder { let n_cols = values[0].len(); let mut fields = ValuesFields::new(); for j in 0..n_cols { - let field_type = schema.field(j).data_type(); - let field_nullable = schema.field(j).is_nullable(); + let field = schema.field(j); + let field_type = field.data_type(); + let field_nullable = field.is_nullable(); + let field_metadata = FieldMetadata::new_from_field(field); for row in values.iter() { let value = &row[j]; let data_type = value.get_type(schema)?; @@ -288,7 +290,12 @@ impl LogicalPlanBuilder { ); } } - fields.push(field_type.to_owned(), field_nullable); + let metadata = if field_metadata.is_empty() { + None + } else { + Some(field_metadata) + }; + fields.push_with_metadata(field_type.to_owned(), field_nullable, metadata); } Self::infer_inner(values, fields, schema) @@ -1567,10 +1574,6 @@ impl ValuesFields { Self::default() } - pub fn push(&mut self, data_type: DataType, nullable: bool) { - self.push_with_metadata(data_type, nullable, None); - } - pub fn push_with_metadata( &mut self, data_type: DataType, diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 32daf65a71fa4..37ee2f2f8eed2 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -641,6 +641,21 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // If no type_planner can handle this type, use the default conversion match sql_type { + // Canonical Arrow extension types + SQLDataType::Uuid => Ok(Arc::new( + Field::new("", DataType::FixedSizeBinary(16), true).with_metadata( + HashMap::from([( + "ARROW:extension:name".to_string(), + "arrow.uuid".to_string(), + )]), + ), + )), + SQLDataType::JSON | SQLDataType::JSONB => Ok(Arc::new( + Field::new("", DataType::Utf8, true).with_metadata(HashMap::from([( + "ARROW:extension:name".to_string(), + "arrow.json".to_string(), + )])), + )), SQLDataType::Array(ArrayElemTypeDef::AngleBracket(inner_sql_type)) => { // Arrays may be multi-dimensional. Ok(self.convert_data_type_to_field(inner_sql_type)?.into_list()) diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index b91e38e53776a..5f3d6ebe4cabc 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -31,6 +31,7 @@ use crate::utils::normalize_ident; use arrow::datatypes::{Field, FieldRef, Fields}; use datafusion_common::error::_plan_err; +use datafusion_common::metadata::FieldMetadata; use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::{ Column, Constraint, Constraints, DFSchema, DFSchemaRef, DataFusionError, Result, @@ -44,7 +45,7 @@ use datafusion_expr::logical_plan::DdlStatement; use datafusion_expr::logical_plan::builder::project; use datafusion_expr::utils::expr_to_columns; use datafusion_expr::{ - Analyze, CreateCatalog, CreateCatalogSchema, + Analyze, Cast, CreateCatalog, CreateCatalogSchema, CreateExternalTable as PlanCreateExternalTable, CreateFunction, CreateFunctionBody, CreateIndex as PlanCreateIndex, CreateMemoryTable, CreateView, Deallocate, DescribeTable, DmlStatement, DropCatalogSchema, DropFunction, DropTable, DropView, @@ -52,7 +53,7 @@ use datafusion_expr::{ LogicalPlan, LogicalPlanBuilder, OperateFunctionArg, PlanType, Prepare, ResetVariable, SetVariable, SortExpr, Statement as PlanStatement, ToStringifiedPlan, TransactionAccessMode, TransactionConclusion, TransactionEnd, - TransactionIsolationLevel, TransactionStart, Volatility, WriteOp, cast, col, + TransactionIsolationLevel, TransactionStart, Volatility, WriteOp, col, }; use sqlparser::ast::{ self, BeginTransactionKind, CheckConstraint, ForeignKeyConstraint, IndexColumn, @@ -530,11 +531,26 @@ impl SqlToRel<'_, S> { .iter() .zip(input_fields) .map(|(field, input_field)| { - cast( - col(input_field.name()), - field.data_type().clone(), - ) - .alias(field.name()) + let target_field = Arc::new( + Field::new( + field.name(), + field.data_type().clone(), + field.is_nullable(), + ) + .with_metadata(field.metadata().clone()), + ); + let metadata = + FieldMetadata::new_from_field(field.as_ref()); + let alias_metadata = if metadata.is_empty() { + None + } else { + Some(metadata) + }; + Expr::Cast(Cast::new_from_field( + Box::new(col(input_field.name())), + target_field, + )) + .alias_with_metadata(field.name(), alias_metadata) }) .collect::>(); diff --git a/datafusion/sql/tests/common/mod.rs b/datafusion/sql/tests/common/mod.rs index 2e91604d33a84..8ad42a6ffffe4 100644 --- a/datafusion/sql/tests/common/mod.rs +++ b/datafusion/sql/tests/common/mod.rs @@ -343,12 +343,6 @@ impl TypePlanner for CustomTypePlanner { sql_type: &sqlparser::ast::DataType, ) -> Result> { match sql_type { - sqlparser::ast::DataType::Uuid => Ok(Some(Arc::new( - Field::new("", DataType::FixedSizeBinary(16), true).with_metadata( - [("ARROW:extension:name".to_string(), "arrow.uuid".to_string())] - .into(), - ), - ))), sqlparser::ast::DataType::Datetime(precision) => { let precision = match precision { Some(0) => TimeUnit::Second, diff --git a/datafusion/sqllogictest/src/test_context.rs b/datafusion/sqllogictest/src/test_context.rs index 33756e191909f..4097f7b82f1b3 100644 --- a/datafusion/sqllogictest/src/test_context.rs +++ b/datafusion/sqllogictest/src/test_context.rs @@ -27,9 +27,7 @@ use arrow::array::{ LargeStringArray, StringArray, TimestampNanosecondArray, UnionArray, }; use arrow::buffer::ScalarBuffer; -use arrow::datatypes::{ - DataType, Field, FieldRef, Schema, SchemaRef, TimeUnit, UnionFields, -}; +use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit, UnionFields}; use arrow::record_batch::RecordBatch; use datafusion::catalog::{ CatalogProvider, MemoryCatalogProvider, MemorySchemaProvider, SchemaProvider, Session, @@ -37,7 +35,6 @@ use datafusion::catalog::{ use datafusion::common::{DataFusionError, Result, not_impl_err}; use datafusion::functions::math::abs; use datafusion::logical_expr::async_udf::{AsyncScalarUDF, AsyncScalarUDFImpl}; -use datafusion::logical_expr::planner::TypePlanner; use datafusion::logical_expr::{ ColumnarValue, Expr, ScalarFunctionArgs, ScalarUDF, ScalarUDFImpl, Signature, Volatility, create_udf, @@ -56,7 +53,6 @@ use datafusion::common::cast::as_float64_array; use datafusion::execution::SessionStateBuilder; use datafusion::execution::runtime_env::RuntimeEnv; use log::info; -use sqlparser::ast; use tempfile::TempDir; /// Context for running tests @@ -67,23 +63,6 @@ pub struct TestContext { test_dir: Option, } -#[derive(Debug)] -struct SqlLogicTestTypePlanner; - -impl TypePlanner for SqlLogicTestTypePlanner { - fn plan_type_field(&self, sql_type: &ast::DataType) -> Result> { - match sql_type { - ast::DataType::Uuid => Ok(Some(Arc::new( - Field::new("", DataType::FixedSizeBinary(16), true).with_metadata( - [("ARROW:extension:name".to_string(), "arrow.uuid".to_string())] - .into(), - ), - ))), - _ => Ok(None), - } - } -} - impl TestContext { pub fn new(ctx: SessionContext) -> Self { Self { @@ -112,14 +91,6 @@ impl TestContext { state_builder = state_builder.with_spark_features(); } - if matches!( - relative_path.file_name().and_then(|name| name.to_str()), - Some("cast_extension_type_metadata.slt") - ) { - state_builder = - state_builder.with_type_planner(Arc::new(SqlLogicTestTypePlanner)); - } - let state = state_builder.build(); let mut test_ctx = TestContext::new(SessionContext::new_with_state(state)); diff --git a/datafusion/sqllogictest/test_files/sql_extension_types.slt b/datafusion/sqllogictest/test_files/sql_extension_types.slt new file mode 100644 index 0000000000000..bb67b437401ee --- /dev/null +++ b/datafusion/sqllogictest/test_files/sql_extension_types.slt @@ -0,0 +1,137 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at + +# http://www.apache.org/licenses/LICENSE-2.0 + +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# Tests for built-in mapping of SQL types to Arrow canonical extension types. +# See: https://arrow.apache.org/docs/format/CanonicalExtensions.html + +####### +# UUID Extension Type +####### + +# CAST to UUID preserves extension metadata +query ?T +SELECT CAST(arrow_cast(X'00010203040506070809000102030506', 'FixedSizeBinary(16)') AS UUID), + arrow_metadata(CAST(arrow_cast(X'00010203040506070809000102030506', 'FixedSizeBinary(16)') AS UUID), 'ARROW:extension:name'); +---- +00010203040506070809000102030506 arrow.uuid + +# CREATE TABLE with UUID column preserves extension metadata through VALUES +statement ok +CREATE TABLE test_uuid ( + value UUID +) AS VALUES + (NULL); + +query TTT +DESCRIBE test_uuid; +---- +value FixedSizeBinary(16) YES + +query T +SELECT arrow_metadata(value, 'ARROW:extension:name') FROM test_uuid; +---- +arrow.uuid + +statement ok +DROP TABLE test_uuid; + +# UUID column with non-null binary value +statement ok +CREATE TABLE test_uuid2 ( + id UUID +) AS VALUES + (CAST(arrow_cast(X'00010203040506070809000102030506', 'FixedSizeBinary(16)') AS UUID)); + +query T +SELECT arrow_metadata(id, 'ARROW:extension:name') FROM test_uuid2; +---- +arrow.uuid + +statement ok +DROP TABLE test_uuid2; + +####### +# JSON Extension Type +####### + +# CAST to JSON preserves extension metadata +query TT +SELECT CAST('{"a": 1}' AS JSON), + arrow_metadata(CAST('{"a": 1}' AS JSON), 'ARROW:extension:name'); +---- +{"a": 1} arrow.json + +# CREATE TABLE with JSON column preserves extension metadata +statement ok +CREATE TABLE test_json ( + data JSON +) AS VALUES + (NULL); + +query TTT +DESCRIBE test_json; +---- +data Utf8 YES + +query T +SELECT arrow_metadata(data, 'ARROW:extension:name') FROM test_json; +---- +arrow.json + +statement ok +DROP TABLE test_json; + +# JSON column with non-null value +statement ok +CREATE TABLE test_json2 ( + data JSON +) AS VALUES + (CAST('{"hello": "world"}' AS JSON)); + +query T +SELECT arrow_metadata(data, 'ARROW:extension:name') FROM test_json2; +---- +arrow.json + +statement ok +DROP TABLE test_json2; + +####### +# JSONB Extension Type (maps to arrow.json) +####### + +# JSONB is treated as arrow.json +query TT +SELECT CAST('{"key": "value"}' AS JSONB), + arrow_metadata(CAST('{"key": "value"}' AS JSONB), 'ARROW:extension:name'); +---- +{"key": "value"} arrow.json + +# CREATE TABLE with JSONB column preserves extension metadata +statement ok +CREATE TABLE test_jsonb ( + data JSONB +) AS VALUES + (NULL); + +query T +SELECT arrow_metadata(data, 'ARROW:extension:name') FROM test_jsonb; +---- +arrow.json + +statement ok +DROP TABLE test_jsonb;