apache · andygrove · Apr 17, 2026
diff --git a/spark/src/test/resources/sql-tests/expressions/string/encode.sql b/spark/src/test/resources/sql-tests/expressions/string/encode.sql
@@ -0,0 +1,153 @@
+-- Licensed to the Apache Software Foundation (ASF) under one
+-- or more contributor license agreements.  See the NOTICE file
+-- distributed with this work for additional information
+-- regarding copyright ownership.  The ASF licenses this file
+-- to you under the Apache License, Version 2.0 (the
+-- "License"); you may not use this file except in compliance
+-- with the License.  You may obtain a copy of the License at
+--
+--   http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing,
+-- software distributed under the License is distributed on an
+-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+-- KIND, either express or implied.  See the License for the
+-- specific language governing permissions and limitations
+-- under the License.
+
+-- Tests for encode(str, charset). These cover the canonical charsets that Spark
+-- supports in both legacy and strict modes, so the expected output is the same
+-- across Spark 3.4, 3.5, and 4.0+. The charset-whitelist enforcement and the
+-- default raise-on-unmappable behavior introduced in Spark 4.0 live in
+-- encode_strict.sql.
+
+statement
+CREATE TABLE test_encode(s string, b binary) USING parquet
+
+statement
+INSERT INTO test_encode VALUES
+  ('Spark SQL',   CAST(x'48656C6C6F' AS BINARY)),
+  ('',            CAST(x'' AS BINARY)),
+  ('naïve',       CAST(x'FFFE' AS BINARY)),
+  ('😀',          CAST(x'F09F9880' AS BINARY)),
+  (NULL,          NULL)
+
+-- ============================================================================
+-- UTF-8 encoding (identity for valid UTF-8 input)
+-- ============================================================================
+
+-- column argument
+query spark_answer_only
+SELECT hex(encode(s, 'UTF-8')) FROM test_encode
+
+-- literal argument
+query spark_answer_only
+SELECT hex(encode('Spark SQL', 'UTF-8'))
+
+-- case-insensitive charset
+query spark_answer_only
+SELECT hex(encode('Spark SQL', 'utf-8')), hex(encode('Spark SQL', 'Utf-8'))
+
+-- empty string returns empty binary, not NULL
+query spark_answer_only
+SELECT encode('', 'UTF-8') IS NULL, length(encode('', 'UTF-8'))
+
+-- emoji (4-byte UTF-8 sequence)
+query spark_answer_only
+SELECT hex(encode('😀', 'UTF-8'))
+
+-- ============================================================================
+-- US-ASCII encoding
+-- ============================================================================
+
+query spark_answer_only
+SELECT hex(encode(s, 'US-ASCII')) FROM test_encode WHERE s IN ('Spark SQL', '')
+
+query spark_answer_only
+SELECT hex(encode('Hello', 'US-ASCII'))
+
+-- ============================================================================
+-- ISO-8859-1 encoding (Latin-1 characters fit in a single byte)
+-- ============================================================================
+
+query spark_answer_only
+SELECT hex(encode(s, 'ISO-8859-1')) FROM test_encode WHERE s IN ('Spark SQL', 'naïve')
+
+query spark_answer_only
+SELECT hex(encode('naïve', 'ISO-8859-1'))
+
+-- ============================================================================
+-- UTF-16 encoding (Spark emits a big-endian BOM FEFF followed by UTF-16BE)
+-- ============================================================================
+
+query spark_answer_only
+SELECT hex(encode('AB', 'UTF-16'))
+
+-- emoji encodes as a surrogate pair, still preceded by the BOM
+query spark_answer_only
+SELECT hex(encode('😀', 'UTF-16'))
+
+-- ============================================================================
+-- UTF-16BE encoding (no BOM)
+-- ============================================================================
+
+query spark_answer_only
+SELECT hex(encode('AB', 'UTF-16BE'))
+
+-- emoji surrogate pair, big-endian
+query spark_answer_only
+SELECT hex(encode('😀', 'UTF-16BE'))
+
+-- ============================================================================
+-- UTF-16LE encoding (no BOM)
+-- ============================================================================
+
+query spark_answer_only
+SELECT hex(encode('AB', 'UTF-16LE'))
+
+-- emoji surrogate pair, little-endian
+query spark_answer_only
+SELECT hex(encode('😀', 'UTF-16LE'))
+
+-- ============================================================================
+-- UTF-32 encoding (Spark does NOT emit a BOM for UTF-32)
+-- ============================================================================
+
+query spark_answer_only
+SELECT hex(encode('A', 'UTF-32'))
+
+query spark_answer_only
+SELECT hex(encode('😀', 'UTF-32'))
+
+-- ============================================================================
+-- NULL handling
+-- ============================================================================
+
+-- NULL string input returns NULL
+query spark_answer_only
+SELECT hex(encode(CAST(NULL AS STRING), 'UTF-8'))
+
+-- NULL charset returns NULL
+query spark_answer_only
+SELECT hex(encode('hello', CAST(NULL AS STRING)))
+
+-- NULL in a column
+query spark_answer_only
+SELECT hex(encode(s, 'UTF-8')) FROM test_encode WHERE s IS NULL
+
+-- ============================================================================
+-- Binary input (Spark implicitly casts BINARY to STRING, invalid UTF-8 bytes
+-- become U+FFFD which is EF BF BD in UTF-8)
+-- ============================================================================
+
+-- valid UTF-8 binary round-trips
+query spark_answer_only
+SELECT hex(encode(CAST(x'48656C6C6F' AS BINARY), 'UTF-8'))
+
+-- invalid UTF-8 binary: each invalid byte becomes U+FFFD
+query spark_answer_only
+SELECT hex(encode(CAST(x'FFFE' AS BINARY), 'UTF-8'))
+
+-- binary column input
+query spark_answer_only
+SELECT hex(encode(b, 'UTF-8')) FROM test_encode WHERE b IS NOT NULL
diff --git a/spark/src/test/resources/sql-tests/expressions/string/encode_strict.sql b/spark/src/test/resources/sql-tests/expressions/string/encode_strict.sql
@@ -0,0 +1,88 @@
+-- Licensed to the Apache Software Foundation (ASF) under one
+-- or more contributor license agreements.  See the NOTICE file
+-- distributed with this work for additional information
+-- regarding copyright ownership.  The ASF licenses this file
+-- to you under the Apache License, Version 2.0 (the
+-- "License"); you may not use this file except in compliance
+-- with the License.  You may obtain a copy of the License at
+--
+--   http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing,
+-- software distributed under the License is distributed on an
+-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+-- KIND, either express or implied.  See the License for the
+-- specific language governing permissions and limitations
+-- under the License.
+
+-- Tests for the strict charset whitelist and raise-on-unmappable behavior that
+-- Spark 4.0 enabled by default. Earlier Spark versions have
+-- spark.sql.legacy.javaCharsets=true and spark.sql.legacy.codingErrorAction=true
+-- by default, which permit extra aliases and replace unmappable characters with
+-- '?', so these assertions only hold on Spark 4.0 and later.
+
+-- MinSparkVersion: 4.0
+
+-- ============================================================================
+-- Charset whitelist: Spark accepts exactly us-ascii, iso-8859-1, utf-8,
+-- utf-16, utf-16be, utf-16le, utf-32. Anything else raises
+-- INVALID_PARAMETER_VALUE.CHARSET.
+-- ============================================================================
+
+-- UTF-32BE and UTF-32LE are not accepted (only UTF-32 is)
+query expect_error(INVALID_PARAMETER_VALUE.CHARSET)
+SELECT encode('A', 'UTF-32BE')
+
+query expect_error(INVALID_PARAMETER_VALUE.CHARSET)
+SELECT encode('A', 'UTF-32LE')
+
+-- Aliases without the hyphen are not accepted
+query expect_error(INVALID_PARAMETER_VALUE.CHARSET)
+SELECT encode('abc', 'UTF8')
+
+query expect_error(INVALID_PARAMETER_VALUE.CHARSET)
+SELECT encode('abc', 'UTF16')
+
+query expect_error(INVALID_PARAMETER_VALUE.CHARSET)
+SELECT encode('abc', 'UTF16BE')
+
+-- ASCII without the US- prefix is not accepted
+query expect_error(INVALID_PARAMETER_VALUE.CHARSET)
+SELECT encode('abc', 'ASCII')
+
+-- ISO-8859-1 aliases LATIN1 and ISO88591 are not accepted
+query expect_error(INVALID_PARAMETER_VALUE.CHARSET)
+SELECT encode('abc', 'LATIN1')
+
+query expect_error(INVALID_PARAMETER_VALUE.CHARSET)
+SELECT encode('abc', 'ISO88591')
+
+-- Completely unknown charsets
+query expect_error(INVALID_PARAMETER_VALUE.CHARSET)
+SELECT encode('abc', 'EBCDIC')
+
+-- ============================================================================
+-- Raise on unmappable characters (legacy.codingErrorAction defaults to false)
+-- ============================================================================
+
+-- U+00E9 (é) is not representable in US-ASCII
+query expect_error(MALFORMED_CHARACTER_CODING)
+SELECT encode('é', 'US-ASCII')
+
+-- U+0100 (Ā) is not representable in ISO-8859-1
+query expect_error(MALFORMED_CHARACTER_CODING)
+SELECT encode(CAST(x'C480' AS BINARY), 'ISO-8859-1')
+
+-- emoji is not representable in US-ASCII
+query expect_error(MALFORMED_CHARACTER_CODING)
+SELECT encode('😀', 'US-ASCII')
+
+-- column argument with an unmappable value also raises
+statement
+CREATE TABLE test_encode_unmappable(s string) USING parquet
+
+statement
+INSERT INTO test_encode_unmappable VALUES ('é')
+
+query expect_error(MALFORMED_CHARACTER_CODING)
+SELECT encode(s, 'US-ASCII') FROM test_encode_unmappable