diff --git a/spark/src/test/resources/sql-tests/expressions/string/encode.sql b/spark/src/test/resources/sql-tests/expressions/string/encode.sql new file mode 100644 index 0000000000..7a537c738d --- /dev/null +++ b/spark/src/test/resources/sql-tests/expressions/string/encode.sql @@ -0,0 +1,153 @@ +-- Licensed to the Apache Software Foundation (ASF) under one +-- or more contributor license agreements. See the NOTICE file +-- distributed with this work for additional information +-- regarding copyright ownership. The ASF licenses this file +-- to you under the Apache License, Version 2.0 (the +-- "License"); you may not use this file except in compliance +-- with the License. You may obtain a copy of the License at +-- +-- http://www.apache.org/licenses/LICENSE-2.0 +-- +-- Unless required by applicable law or agreed to in writing, +-- software distributed under the License is distributed on an +-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +-- KIND, either express or implied. See the License for the +-- specific language governing permissions and limitations +-- under the License. + +-- Tests for encode(str, charset). These cover the canonical charsets that Spark +-- supports in both legacy and strict modes, so the expected output is the same +-- across Spark 3.4, 3.5, and 4.0+. The charset-whitelist enforcement and the +-- default raise-on-unmappable behavior introduced in Spark 4.0 live in +-- encode_strict.sql. 
+
+statement
+CREATE TABLE test_encode(s string, b binary) USING parquet
+
+statement
+INSERT INTO test_encode VALUES
+  ('Spark SQL', CAST(x'48656C6C6F' AS BINARY)),
+  ('', CAST(x'' AS BINARY)),
+  ('naïve', CAST(x'FFFE' AS BINARY)),
+  ('😀', CAST(x'F09F9880' AS BINARY)),
+  (NULL, NULL)
+
+-- ============================================================================
+-- UTF-8 encoding (identity for valid UTF-8 input)
+-- ============================================================================
+
+-- column argument; ORDER BY pins the row order so the recorded expected output
+-- is deterministic (NULL sorts first under Spark's default ascending order)
+query spark_answer_only
+SELECT hex(encode(s, 'UTF-8')) FROM test_encode ORDER BY s
+
+-- literal argument
+query spark_answer_only
+SELECT hex(encode('Spark SQL', 'UTF-8'))
+
+-- case-insensitive charset
+query spark_answer_only
+SELECT hex(encode('Spark SQL', 'utf-8')), hex(encode('Spark SQL', 'Utf-8'))
+
+-- empty string returns empty binary, not NULL
+query spark_answer_only
+SELECT encode('', 'UTF-8') IS NULL, length(encode('', 'UTF-8'))
+
+-- emoji (4-byte UTF-8 sequence)
+query spark_answer_only
+SELECT hex(encode('😀', 'UTF-8'))
+
+-- ============================================================================
+-- US-ASCII encoding
+-- ============================================================================
+
+-- ORDER BY keeps the two-row result deterministic
+query spark_answer_only
+SELECT hex(encode(s, 'US-ASCII')) FROM test_encode WHERE s IN ('Spark SQL', '') ORDER BY s
+
+query spark_answer_only
+SELECT hex(encode('Hello', 'US-ASCII'))
+
+-- ============================================================================
+-- ISO-8859-1 encoding (Latin-1 characters fit in a single byte)
+-- ============================================================================
+
+-- ORDER BY keeps the two-row result deterministic
+query spark_answer_only
+SELECT hex(encode(s, 'ISO-8859-1')) FROM test_encode WHERE s IN ('Spark SQL', 'naïve') ORDER BY s
+
+query spark_answer_only
+SELECT hex(encode('naïve', 'ISO-8859-1'))
+
+-- ============================================================================
+-- UTF-16 encoding (Spark emits a big-endian BOM FEFF followed by UTF-16BE)
+-- ============================================================================
+
+query spark_answer_only
+SELECT hex(encode('AB', 'UTF-16'))
+
+-- emoji encodes as a surrogate pair, still preceded by the BOM
+query spark_answer_only
+SELECT hex(encode('😀', 'UTF-16'))
+
+-- ============================================================================
+-- UTF-16BE encoding (no BOM)
+-- ============================================================================
+
+query spark_answer_only
+SELECT hex(encode('AB', 'UTF-16BE'))
+
+-- emoji surrogate pair, big-endian
+query spark_answer_only
+SELECT hex(encode('😀', 'UTF-16BE'))
+
+-- ============================================================================
+-- UTF-16LE encoding (no BOM)
+-- ============================================================================
+
+query spark_answer_only
+SELECT hex(encode('AB', 'UTF-16LE'))
+
+-- emoji surrogate pair, little-endian
+query spark_answer_only
+SELECT hex(encode('😀', 'UTF-16LE'))
+
+-- ============================================================================
+-- UTF-32 encoding (Spark does NOT emit a BOM for UTF-32)
+-- ============================================================================
+
+query spark_answer_only
+SELECT hex(encode('A', 'UTF-32'))
+
+query spark_answer_only
+SELECT hex(encode('😀', 'UTF-32'))
+
+-- ============================================================================
+-- NULL handling
+-- ============================================================================
+
+-- NULL string input returns NULL
+query spark_answer_only
+SELECT hex(encode(CAST(NULL AS STRING), 'UTF-8'))
+
+-- NULL charset returns NULL
+query spark_answer_only
+SELECT hex(encode('hello', CAST(NULL AS STRING)))
+
+-- NULL in a column
+query spark_answer_only
+SELECT hex(encode(s, 'UTF-8')) FROM test_encode WHERE s IS NULL
+
+-- ============================================================================
+-- Binary input (Spark implicitly casts BINARY to STRING, invalid UTF-8 bytes
+-- become U+FFFD which is EF BF BD in UTF-8)
+-- ============================================================================
+
+-- valid UTF-8 binary round-trips
+query spark_answer_only
+SELECT hex(encode(CAST(x'48656C6C6F' AS BINARY), 'UTF-8'))
+
+-- invalid UTF-8 binary: each invalid byte becomes U+FFFD
+query spark_answer_only
+SELECT hex(encode(CAST(x'FFFE' AS BINARY), 'UTF-8'))
+
+-- binary column input; ORDER BY pins the multi-row result order
+query spark_answer_only
+SELECT hex(encode(b, 'UTF-8')) FROM test_encode WHERE b IS NOT NULL ORDER BY b
diff --git a/spark/src/test/resources/sql-tests/expressions/string/encode_strict.sql b/spark/src/test/resources/sql-tests/expressions/string/encode_strict.sql
new file mode 100644
index 0000000000..a085775c96
--- /dev/null
+++ b/spark/src/test/resources/sql-tests/expressions/string/encode_strict.sql
@@ -0,0 +1,88 @@
+-- Licensed to the Apache Software Foundation (ASF) under one
+-- or more contributor license agreements. See the NOTICE file
+-- distributed with this work for additional information
+-- regarding copyright ownership. The ASF licenses this file
+-- to you under the Apache License, Version 2.0 (the
+-- "License"); you may not use this file except in compliance
+-- with the License. You may obtain a copy of the License at
+--
+-- http://www.apache.org/licenses/LICENSE-2.0
+--
+-- Unless required by applicable law or agreed to in writing,
+-- software distributed under the License is distributed on an
+-- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+-- KIND, either express or implied. See the License for the
+-- specific language governing permissions and limitations
+-- under the License.
+
+-- Tests for the strict charset whitelist and raise-on-unmappable behavior that
+-- Spark 4.0 enabled by default.
Earlier Spark versions lack the
+-- spark.sql.legacy.javaCharsets and spark.sql.legacy.codingErrorAction configs
+-- (both were introduced in Spark 4.0, defaulting to false) and unconditionally
+-- use the legacy behavior those configs gate: extra charset aliases are
+-- accepted and unmappable characters are replaced with '?'. These assertions
+-- therefore only hold on Spark 4.0 and later.
+
+-- MinSparkVersion: 4.0
+
+-- ============================================================================
+-- Charset whitelist: Spark accepts exactly us-ascii, iso-8859-1, utf-8,
+-- utf-16, utf-16be, utf-16le, utf-32. Anything else raises
+-- INVALID_PARAMETER_VALUE.CHARSET.
+-- ============================================================================
+
+-- UTF-32BE and UTF-32LE are not accepted (only UTF-32 is)
+query expect_error(INVALID_PARAMETER_VALUE.CHARSET)
+SELECT encode('A', 'UTF-32BE')
+
+query expect_error(INVALID_PARAMETER_VALUE.CHARSET)
+SELECT encode('A', 'UTF-32LE')
+
+-- Aliases without the hyphen are not accepted
+query expect_error(INVALID_PARAMETER_VALUE.CHARSET)
+SELECT encode('abc', 'UTF8')
+
+query expect_error(INVALID_PARAMETER_VALUE.CHARSET)
+SELECT encode('abc', 'UTF16')
+
+query expect_error(INVALID_PARAMETER_VALUE.CHARSET)
+SELECT encode('abc', 'UTF16BE')
+
+-- ASCII without the US- prefix is not accepted
+query expect_error(INVALID_PARAMETER_VALUE.CHARSET)
+SELECT encode('abc', 'ASCII')
+
+-- ISO-8859-1 aliases LATIN1 and ISO88591 are not accepted
+query expect_error(INVALID_PARAMETER_VALUE.CHARSET)
+SELECT encode('abc', 'LATIN1')
+
+query expect_error(INVALID_PARAMETER_VALUE.CHARSET)
+SELECT encode('abc', 'ISO88591')
+
+-- Completely unknown charsets
+query expect_error(INVALID_PARAMETER_VALUE.CHARSET)
+SELECT encode('abc', 'EBCDIC')
+
+-- ============================================================================
+-- Raise on unmappable characters (legacy.codingErrorAction defaults to false)
+-- ============================================================================
+
+-- U+00E9 (é) is not representable in US-ASCII
+query expect_error(MALFORMED_CHARACTER_CODING)
+SELECT encode('é', 'US-ASCII')
+
+-- U+0100 (Ā) is not representable in ISO-8859-1
+query expect_error(MALFORMED_CHARACTER_CODING)
+SELECT encode(CAST(x'C480' AS BINARY), 'ISO-8859-1')
+
+-- emoji is not representable in US-ASCII
+query expect_error(MALFORMED_CHARACTER_CODING)
+SELECT encode('😀', 'US-ASCII')
+
+-- column argument with an unmappable value also raises
+statement
+CREATE TABLE test_encode_unmappable(s string) USING parquet
+
+statement
+INSERT INTO test_encode_unmappable VALUES ('é')
+
+query expect_error(MALFORMED_CHARACTER_CODING)
+SELECT encode(s, 'US-ASCII') FROM test_encode_unmappable