From 0d917f33c3436b7b2e269fb58a17bb996e6e2395 Mon Sep 17 00:00:00 2001 From: whisper33z Date: Wed, 21 Jan 2026 18:56:14 +0800 Subject: [PATCH 01/15] Add levenshtein function --- .../exprs/function/function_levenshtein.cpp | 165 ++++++++++++++++++ .../exprs/function/simple_function_factory.h | 2 + .../doris/catalog/BuiltinScalarFunctions.java | 2 + .../executable/StringArithmetic.java | 47 +++++ .../functions/scalar/Levenshtein.java | 76 ++++++++ .../visitor/ScalarFunctionVisitor.java | 5 + .../string_functions/test_string_all.out | 7 +- .../string_functions/test_string_all.groovy | 8 +- 8 files changed, 310 insertions(+), 2 deletions(-) create mode 100644 be/src/exprs/function/function_levenshtein.cpp create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/Levenshtein.java diff --git a/be/src/exprs/function/function_levenshtein.cpp b/be/src/exprs/function/function_levenshtein.cpp new file mode 100644 index 00000000000000..5dbc042df2ef11 --- /dev/null +++ b/be/src/exprs/function/function_levenshtein.cpp @@ -0,0 +1,165 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "common/status.h" +#include "util/simd/vstring_function.h" +#include "vec/columns/column_string.h" +#include "vec/common/string_ref.h" +#include "vec/data_types/data_type_number.h" +#include "vec/functions/function.h" +#include "vec/functions/simple_function_factory.h" + +namespace doris::vectorized { +#include "common/compile_check_begin.h" + +class FunctionLevenshtein : public IFunction { +public: + static constexpr auto name = "levenshtein"; + + static FunctionPtr create() { return std::make_shared(); } + + String get_name() const override { return name; } + + size_t get_number_of_arguments() const override { return 2; } + + DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { + return std::make_shared(); + } + + Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, + uint32_t result, size_t input_rows_count) const override { + const ColumnPtr left_col = block.get_by_position(arguments[0]).column; + const ColumnPtr right_col = block.get_by_position(arguments[1]).column; + + auto res_column = ColumnInt32::create(input_rows_count); + auto& res_data = res_column->get_data(); + + for (size_t i = 0; i < input_rows_count; ++i) { + const StringRef left = left_col->get_data_at(i); + const StringRef right = right_col->get_data_at(i); + res_data[i] = levenshtein_distance(left, right); + } + + block.replace_by_position(result, std::move(res_column)); + return Status::OK(); + } + +private: + static void utf8_char_offsets(const StringRef& ref, std::vector& offsets) { + offsets.clear(); + offsets.reserve(ref.size); + const char* data = ref.data; + size_t size = ref.size; + size_t i = 0; + while (i < size) { + offsets.push_back(i); + uint8_t char_len = UTF8_BYTE_LENGTH[static_cast(data[i])]; + if (char_len == 0) { + char_len = 1; + } + if (i + char_len > size) { + char_len = static_cast(size - i); + } + i += char_len; + } + } + + static inline bool utf8_char_equal(const StringRef& left, size_t left_off, size_t left_next, + const StringRef& right, size_t right_off, + size_t right_next) { + size_t left_len = left_next - left_off; + size_t right_len = right_next - right_off; + if (left_len != right_len) { + return false; + } + return std::memcmp(left.data + left_off, right.data + right_off, left_len) == 0; + } + + static int levenshtein_distance(const StringRef& left, const StringRef& right) { + if (left.size == 0) { + return static_cast( + simd::VStringFunctions::get_char_len(right.data, right.size)); + } + if (right.size == 0) { + return static_cast( + simd::VStringFunctions::get_char_len(left.data, left.size)); + } + + std::vector left_offsets; + std::vector right_offsets; + utf8_char_offsets(left, left_offsets); + utf8_char_offsets(right, right_offsets); + + const StringRef* left_ref = &left; + const StringRef* right_ref = &right; + if (right_offsets.size() > left_offsets.size()) { + std::swap(left_offsets, right_offsets); + std::swap(left_ref, right_ref); + } + + const size_t m = left_offsets.size(); + const size_t n = right_offsets.size(); + + if (m == 0) { + return static_cast(n); + } + if (n == 0) { + return static_cast(m); + } + + std::vector prev(n + 1); + std::vector curr(n + 1); + for (size_t j = 0; j <= n; ++j) { + prev[j] = static_cast(j); + } + + for (size_t i = 1; i <= m; ++i) { + curr[0] = static_cast(i); + size_t left_off = left_offsets[i - 1]; + size_t left_next = (i < m) ? left_offsets[i] : left_ref->size; + + for (size_t j = 1; j <= n; ++j) { + size_t right_off = right_offsets[j - 1]; + size_t right_next = (j < n) ? right_offsets[j] : right_ref->size; + + int cost = utf8_char_equal(*left_ref, left_off, left_next, *right_ref, right_off, + right_next) + ? 0 + : 1; + + int insert_cost = curr[j - 1] + 1; + int delete_cost = prev[j] + 1; + int replace_cost = prev[j - 1] + cost; + curr[j] = std::min({insert_cost, delete_cost, replace_cost}); + } + std::swap(prev, curr); + } + + return prev[n]; + } +}; + +void register_function_levenshtein(SimpleFunctionFactory& factory) { + factory.register_function(); +} + +#include "common/compile_check_end.h" +} // namespace doris::vectorized diff --git a/be/src/exprs/function/simple_function_factory.h b/be/src/exprs/function/simple_function_factory.h index c1ebcc34535c67..de3ff92b5919a8 100644 --- a/be/src/exprs/function/simple_function_factory.h +++ b/be/src/exprs/function/simple_function_factory.h @@ -120,6 +120,7 @@ void register_function_ai(SimpleFunctionFactory& factory); void register_function_score(SimpleFunctionFactory& factory); void register_function_variant_type(SimpleFunctionFactory& factory); void register_function_binary(SimpleFunctionFactory& factory); +void register_function_levenshtein(SimpleFunctionFactory& factory); void register_function_soundex(SimpleFunctionFactory& factory); #if defined(BE_TEST) && !defined(BE_BENCHMARK) @@ -356,6 +357,7 @@ class SimpleFunctionFactory { register_function_ai(instance); register_function_score(instance); register_function_binary(instance); + register_function_levenshtein(instance); register_function_soundex(instance); register_function_json_transform(instance); register_function_json_hash(instance); diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java index f7b21c7dfbf095..8284b09a51e3b5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java @@ -316,6 +316,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.Least; import org.apache.doris.nereids.trees.expressions.functions.scalar.Left; import org.apache.doris.nereids.trees.expressions.functions.scalar.Length; +import org.apache.doris.nereids.trees.expressions.functions.scalar.Levenshtein; import org.apache.doris.nereids.trees.expressions.functions.scalar.Ln; import org.apache.doris.nereids.trees.expressions.functions.scalar.Locate; import org.apache.doris.nereids.trees.expressions.functions.scalar.Log; @@ -882,6 +883,7 @@ public class BuiltinScalarFunctions implements FunctionHelper { scalar(LastQueryId.class, "last_query_id"), scalar(Lcm.class, "lcm"), scalar(Least.class, "least"), + scalar(Levenshtein.class, "levenshtein"), scalar(Left.class, "left", "strleft"), scalar(Length.class, "length", "octet_length"), scalar(Crc32.class, "crc32"), diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/executable/StringArithmetic.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/executable/StringArithmetic.java index 0172c3b433940f..f0bc0eb01d0162 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/executable/StringArithmetic.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/executable/StringArithmetic.java @@ -1123,6 +1123,53 @@ public static Expression soundex(StringLikeLiteral first) { return castStringLikeLiteral(first, result); } + /** + * Executable arithmetic functions levenshtein + */ + @ExecFunction(name = "levenshtein") + public static Expression levenshtein(StringLikeLiteral first, StringLikeLiteral second) { + int[] left = first.getValue().codePoints().toArray(); + int[] right = second.getValue().codePoints().toArray(); + + if (right.length > left.length) { + int[] tmp = left; + left = right; + right = tmp; + } + + int m = left.length; + int n = right.length; + if (n == 0) { + return new IntegerLiteral(m); + } + if (m == 0) { + return new IntegerLiteral(n); + } + + int[] prev = new int[n + 1]; + int[] curr = new int[n + 1]; + for (int j = 0; j <= n; j++) { + prev[j] = j; + } + + for (int i = 1; i <= m; i++) { + curr[0] = i; + int leftChar = left[i - 1]; + for (int j = 1; j <= n; j++) { + int cost = leftChar == right[j - 1] ? 0 : 1; + int insertCost = curr[j - 1] + 1; + int deleteCost = prev[j] + 1; + int replaceCost = prev[j - 1] + cost; + curr[j] = Math.min(insertCost, Math.min(deleteCost, replaceCost)); + } + int[] tmp = prev; + prev = curr; + curr = tmp; + } + + return new IntegerLiteral(prev[n]); + } + /** * Executable arithmetic functions make_set */ diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/Levenshtein.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/Levenshtein.java new file mode 100644 index 00000000000000..c1095b27a26262 --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/Levenshtein.java @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.trees.expressions.functions.scalar; + +import org.apache.doris.catalog.FunctionSignature; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature; +import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable; +import org.apache.doris.nereids.trees.expressions.shape.BinaryExpression; +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.IntegerType; +import org.apache.doris.nereids.types.StringType; +import org.apache.doris.nereids.types.VarcharType; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; + +import java.util.List; + +/** + * ScalarFunction 'levenshtein'. + */ +public class Levenshtein extends ScalarFunction + implements BinaryExpression, ExplicitlyCastableSignature, PropagateNullable { + + public static final List SIGNATURES = ImmutableList.of( + FunctionSignature.ret(IntegerType.INSTANCE).args(VarcharType.SYSTEM_DEFAULT, VarcharType.SYSTEM_DEFAULT), + FunctionSignature.ret(IntegerType.INSTANCE).args(StringType.INSTANCE, StringType.INSTANCE) + ); + + /** + * constructor with 2 arguments. + */ + public Levenshtein(Expression arg0, Expression arg1) { + super("levenshtein", arg0, arg1); + } + + /** constructor for withChildren and reuse signature */ + private Levenshtein(ScalarFunctionParams functionParams) { + super(functionParams); + } + + /** + * withChildren. + */ + @Override + public Levenshtein withChildren(List children) { + Preconditions.checkArgument(children.size() == 2); + return new Levenshtein(getFunctionParams(children)); + } + + @Override + public List getSignatures() { + return SIGNATURES; + } + + @Override + public R accept(ExpressionVisitor visitor, C context) { + return visitor.visitLevenshtein(this, context); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java index a20abfeae853c7..7d1ebfba6165b6 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java @@ -336,6 +336,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.Least; import org.apache.doris.nereids.trees.expressions.functions.scalar.Left; import org.apache.doris.nereids.trees.expressions.functions.scalar.Length; +import org.apache.doris.nereids.trees.expressions.functions.scalar.Levenshtein; import org.apache.doris.nereids.trees.expressions.functions.scalar.Ln; import org.apache.doris.nereids.trees.expressions.functions.scalar.Locate; import org.apache.doris.nereids.trees.expressions.functions.scalar.Log; @@ -1885,6 +1886,10 @@ default R visitLocate(Locate locate, C context) { return visitScalarFunction(locate, context); } + default R visitLevenshtein(Levenshtein levenshtein, C context) { + return visitScalarFunction(levenshtein, context); + } + default R visitLog(Log log, C context) { return visitScalarFunction(log, context); } diff --git a/regression-test/data/query_p0/sql_functions/string_functions/test_string_all.out b/regression-test/data/query_p0/sql_functions/string_functions/test_string_all.out index d126d2cd8ea602..02438962c664aa 100644 --- a/regression-test/data/query_p0/sql_functions/string_functions/test_string_all.out +++ b/regression-test/data/query_p0/sql_functions/string_functions/test_string_all.out @@ -965,6 +965,12 @@ S530 S530 -- !soundex_330 -- R163 R163 +-- !levenshtein_331 -- +0 3 2 1 1 + +-- !levenshtein_332 -- +0 3 3 \N \N + -- !space_333 -- @@ -1411,4 +1417,3 @@ Hello Test123 -- !xpath_string_486 -- 123 - diff --git a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_all.groovy b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_all.groovy index 9d7123b03a79d4..313131465e3d31 100644 --- a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_all.groovy +++ b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_all.groovy @@ -753,6 +753,12 @@ suite("string_functions_all") { testFoldConst("SELECT soundex('R@b-e123rt'), soundex('Robert');") // SOUNDEX tests with non-ASCII characters - Skipped (not supported) + // LEVENSHTEIN tests + qt_levenshtein_331 "SELECT levenshtein('', ''), levenshtein('kitten', 'sitting'), levenshtein('flaw', 'lawn'), levenshtein('你好', '你们'), levenshtein('数据库', '数据');" + testFoldConst("SELECT levenshtein('', ''), levenshtein('kitten', 'sitting'), levenshtein('flaw', 'lawn'), levenshtein('你好', '你们'), levenshtein('数据库', '数据');") + qt_levenshtein_332 "SELECT levenshtein('abc', 'abc'), levenshtein('abc', ''), levenshtein('', 'abc'), levenshtein(NULL, 'abc'), levenshtein('abc', NULL);" + testFoldConst("SELECT levenshtein('abc', 'abc'), levenshtein('abc', ''), levenshtein('', 'abc'), levenshtein(NULL, 'abc'), levenshtein('abc', NULL);") + // SPACE tests qt_space_333 "SELECT space(5);" testFoldConst("SELECT space(5);") @@ -1092,4 +1098,4 @@ suite("string_functions_all") { testFoldConst("SELECT xpath_string(NULL, '/a');") qt_xpath_string_486 "SELECT xpath_string('123', '/a');" testFoldConst("SELECT xpath_string('123', '/a');") -} \ No newline at end of file +} From 5638e3461cf1c0592732b1a58384d9bc5a032443 Mon Sep 17 00:00:00 2001 From: whisper33z Date: Thu, 29 Jan 2026 00:11:57 +0800 Subject: [PATCH 02/15] Add hamming_distance function --- be/src/exprs/function/function_string.cpp | 123 ++++++++++++++++++ .../doris/catalog/BuiltinScalarFunctions.java | 2 + .../executable/StringArithmetic.java | 21 +++ .../functions/scalar/HammingDistance.java | 78 +++++++++++ .../visitor/ScalarFunctionVisitor.java | 5 + .../string_functions/test_string_all.out | 6 + .../string_functions/test_string_all.groovy | 6 + 7 files changed, 241 insertions(+) create mode 100644 fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/HammingDistance.java diff --git a/be/src/exprs/function/function_string.cpp b/be/src/exprs/function/function_string.cpp index 053921ce4fa0d6..b6320de5e1e219 100644 --- a/be/src/exprs/function/function_string.cpp +++ b/be/src/exprs/function/function_string.cpp @@ -46,6 +46,38 @@ namespace doris { #include "common/compile_check_begin.h" + +namespace { +void utf8_char_offsets(const StringRef& ref, std::vector& offsets) { + offsets.clear(); + offsets.reserve(ref.size); + const char* data = ref.data; + size_t size = ref.size; + size_t i = 0; + while (i < size) { + offsets.push_back(i); + uint8_t char_len = UTF8_BYTE_LENGTH[static_cast(data[i])]; + if (char_len == 0) { + char_len = 1; + } + if (i + char_len > size) { + char_len = static_cast(size - i); + } + i += char_len; + } +} + +inline bool utf8_char_equal(const StringRef& left, size_t left_off, size_t left_next, + const StringRef& right, size_t right_off, size_t right_next) { + size_t left_len = left_next - left_off; + size_t right_len = right_next - right_off; + if (left_len != right_len) { + return false; + } + return std::memcmp(left.data + left_off, right.data + right_off, left_len) == 0; +} +} // namespace + struct NameStringASCII { static constexpr auto name = "ascii"; }; @@ -415,6 +447,10 @@ struct NameLocate { static constexpr auto name = "locate"; }; +struct NameHammingDistance { + static constexpr auto name = "hamming_distance"; +}; + // LeftDataType and RightDataType are DataTypeString template struct StringLocateImpl { @@ -502,6 +538,89 @@ struct StringFunctionImpl { } }; +// LeftDataType and RightDataType are DataTypeString +template +struct HammingDistanceImpl { + using ResultDataType = DataTypeInt64; + using ResultPaddedPODArray = PaddedPODArray; + + static Status vector_vector(const ColumnString::Chars& ldata, + const ColumnString::Offsets& loffsets, + const ColumnString::Chars& rdata, + const ColumnString::Offsets& roffsets, + ResultPaddedPODArray& res) { + DCHECK_EQ(loffsets.size(), roffsets.size()); + auto size = loffsets.size(); + res.resize(size); + for (size_t i = 0; i < size; ++i) { + const char* l_raw_str = reinterpret_cast(&ldata[loffsets[i - 1]]); + int l_str_size = loffsets[i] - loffsets[i - 1]; + const char* r_raw_str = reinterpret_cast(&rdata[roffsets[i - 1]]); + int r_str_size = roffsets[i] - roffsets[i - 1]; + const StringRef lref(l_raw_str, l_str_size); + const StringRef rref(r_raw_str, r_str_size); + RETURN_IF_ERROR(hamming_distance(lref, rref, res[i], i)); + } + return Status::OK(); + } + + static Status vector_scalar(const ColumnString::Chars& ldata, + const ColumnString::Offsets& loffsets, const StringRef& rdata, + ResultPaddedPODArray& res) { + auto size = loffsets.size(); + res.resize(size); + for (size_t i = 0; i < size; ++i) { + const char* l_raw_str = reinterpret_cast(&ldata[loffsets[i - 1]]); + int l_str_size = loffsets[i] - loffsets[i - 1]; + const StringRef lref(l_raw_str, l_str_size); + RETURN_IF_ERROR(hamming_distance(lref, rdata, res[i], i)); + } + return Status::OK(); + } + + static Status scalar_vector(const StringRef& ldata, const ColumnString::Chars& rdata, + const ColumnString::Offsets& roffsets, + ResultPaddedPODArray& res) { + auto size = roffsets.size(); + res.resize(size); + for (size_t i = 0; i < size; ++i) { + const char* r_raw_str = reinterpret_cast(&rdata[roffsets[i - 1]]); + int r_str_size = roffsets[i] - roffsets[i - 1]; + const StringRef rref(r_raw_str, r_str_size); + RETURN_IF_ERROR(hamming_distance(ldata, rref, res[i], i)); + } + return Status::OK(); + } + +private: + static Status hamming_distance(const StringRef& left, const StringRef& right, Int64& result, + size_t row) { + std::vector left_offsets; + std::vector right_offsets; + utf8_char_offsets(left, left_offsets); + utf8_char_offsets(right, right_offsets); + + if (left_offsets.size() != right_offsets.size()) { + return Status::InvalidArgument( + "hamming_distance requires strings of the same length at row {}", row); + } + + Int64 distance = 0; + const size_t len = left_offsets.size(); + for (size_t i = 0; i < len; ++i) { + size_t left_off = left_offsets[i]; + size_t left_next = (i + 1 < len) ? left_offsets[i + 1] : left.size; + size_t right_off = right_offsets[i]; + size_t right_next = (i + 1 < len) ? right_offsets[i + 1] : right.size; + if (!utf8_char_equal(left, left_off, left_next, right, right_off, right_next)) { + ++distance; + } + } + result = distance; + return Status::OK(); + } +}; + struct NameToLower { static constexpr auto name = "lower"; }; @@ -1326,6 +1445,9 @@ using FunctionStringLocate = FunctionBinaryToType; using FunctionStringFindInSet = FunctionBinaryToType; +using FunctionHammingDistance = + FunctionBinaryToType; using FunctionQuote = FunctionStringToString; @@ -1360,6 +1482,7 @@ void register_function_string(SimpleFunctionFactory& factory) { factory.register_function(); factory.register_function(); factory.register_function(); + factory.register_function(); factory.register_function(); factory.register_function(); factory.register_function(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java index 8284b09a51e3b5..bd2c45601d4b4a 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/BuiltinScalarFunctions.java @@ -234,6 +234,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.Greatest; import org.apache.doris.nereids.trees.expressions.functions.scalar.Grouping; import org.apache.doris.nereids.trees.expressions.functions.scalar.GroupingId; +import org.apache.doris.nereids.trees.expressions.functions.scalar.HammingDistance; import org.apache.doris.nereids.trees.expressions.functions.scalar.Hex; import org.apache.doris.nereids.trees.expressions.functions.scalar.HllCardinality; import org.apache.doris.nereids.trees.expressions.functions.scalar.HllEmpty; @@ -798,6 +799,7 @@ public class BuiltinScalarFunctions implements FunctionHelper { scalar(Greatest.class, "greatest"), scalar(Grouping.class, "grouping"), scalar(GroupingId.class, "grouping_id"), + scalar(HammingDistance.class, "hamming_distance"), scalar(Hex.class, "hex"), scalar(HllCardinality.class, "hll_cardinality"), scalar(HllEmpty.class, "hll_empty"), diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/executable/StringArithmetic.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/executable/StringArithmetic.java index f0bc0eb01d0162..570d0bb4b98c95 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/executable/StringArithmetic.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/executable/StringArithmetic.java @@ -1170,6 +1170,27 @@ public static Expression levenshtein(StringLikeLiteral first, StringLikeLiteral return new IntegerLiteral(prev[n]); } + /** + * Executable arithmetic functions hamming_distance + */ + @ExecFunction(name = "hamming_distance") + public static Expression hammingDistance(StringLikeLiteral first, StringLikeLiteral second) { + int[] left = first.getValue().codePoints().toArray(); + int[] right = second.getValue().codePoints().toArray(); + + if (left.length != right.length) { + throw new AnalysisException("hamming_distance requires strings of the same length"); + } + + long distance = 0; + for (int i = 0; i < left.length; i++) { + if (left[i] != right[i]) { + distance++; + } + } + return new BigIntLiteral(distance); + } + /** * Executable arithmetic functions make_set */ diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/HammingDistance.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/HammingDistance.java new file mode 100644 index 00000000000000..a874ed7a912f2d --- /dev/null +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/functions/scalar/HammingDistance.java @@ -0,0 +1,78 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.nereids.trees.expressions.functions.scalar; + +import org.apache.doris.catalog.FunctionSignature; +import org.apache.doris.nereids.trees.expressions.Expression; +import org.apache.doris.nereids.trees.expressions.functions.ExplicitlyCastableSignature; +import org.apache.doris.nereids.trees.expressions.functions.PropagateNullable; +import org.apache.doris.nereids.trees.expressions.shape.BinaryExpression; +import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor; +import org.apache.doris.nereids.types.BigIntType; +import org.apache.doris.nereids.types.StringType; +import org.apache.doris.nereids.types.VarcharType; + +import com.google.common.base.Preconditions; +import com.google.common.collect.ImmutableList; + +import java.util.List; + +/** + * ScalarFunction 'hamming_distance'. + */ +public class HammingDistance extends ScalarFunction + implements BinaryExpression, ExplicitlyCastableSignature, PropagateNullable { + + public static final List SIGNATURES = ImmutableList.of( + FunctionSignature.ret(BigIntType.INSTANCE) + .args(VarcharType.SYSTEM_DEFAULT, VarcharType.SYSTEM_DEFAULT), + FunctionSignature.ret(BigIntType.INSTANCE) + .args(StringType.INSTANCE, StringType.INSTANCE) + ); + + /** + * constructor with 2 arguments. + */ + public HammingDistance(Expression arg0, Expression arg1) { + super("hamming_distance", arg0, arg1); + } + + /** constructor for withChildren and reuse signature */ + private HammingDistance(ScalarFunctionParams functionParams) { + super(functionParams); + } + + /** + * withChildren. + */ + @Override + public HammingDistance withChildren(List children) { + Preconditions.checkArgument(children.size() == 2); + return new HammingDistance(getFunctionParams(children)); + } + + @Override + public List getSignatures() { + return SIGNATURES; + } + + @Override + public R accept(ExpressionVisitor visitor, C context) { + return visitor.visitHammingDistance(this, context); + } +} diff --git a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java index 7d1ebfba6165b6..26f1ab235dea44 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java +++ b/fe/fe-core/src/main/java/org/apache/doris/nereids/trees/expressions/visitor/ScalarFunctionVisitor.java @@ -248,6 +248,7 @@ import org.apache.doris.nereids.trees.expressions.functions.scalar.GetFormat; import org.apache.doris.nereids.trees.expressions.functions.scalar.GetVariantType; import org.apache.doris.nereids.trees.expressions.functions.scalar.Greatest; +import org.apache.doris.nereids.trees.expressions.functions.scalar.HammingDistance; import org.apache.doris.nereids.trees.expressions.functions.scalar.Hex; import org.apache.doris.nereids.trees.expressions.functions.scalar.HllCardinality; import org.apache.doris.nereids.trees.expressions.functions.scalar.HllEmpty; @@ -1886,6 +1887,10 @@ default R visitLocate(Locate locate, C context) { return visitScalarFunction(locate, context); } + default R visitHammingDistance(HammingDistance hammingDistance, C context) { + return visitScalarFunction(hammingDistance, context); + } + default R visitLevenshtein(Levenshtein levenshtein, C context) { return visitScalarFunction(levenshtein, context); } diff --git a/regression-test/data/query_p0/sql_functions/string_functions/test_string_all.out b/regression-test/data/query_p0/sql_functions/string_functions/test_string_all.out index 02438962c664aa..b5f089287d59e1 100644 --- a/regression-test/data/query_p0/sql_functions/string_functions/test_string_all.out +++ b/regression-test/data/query_p0/sql_functions/string_functions/test_string_all.out @@ -971,6 +971,12 @@ R163 R163 -- !levenshtein_332 -- 0 3 3 \N \N +-- !hamming_distance_3331 -- +0 0 1 1 + +-- !hamming_distance_3332 -- +0 \N \N + -- !space_333 -- diff --git a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_all.groovy b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_all.groovy index 313131465e3d31..8709863f480bfa 100644 --- a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_all.groovy +++ b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_all.groovy @@ -759,6 +759,12 @@ suite("string_functions_all") { qt_levenshtein_332 "SELECT levenshtein('abc', 'abc'), levenshtein('abc', ''), levenshtein('', 'abc'), levenshtein(NULL, 'abc'), levenshtein('abc', NULL);" testFoldConst("SELECT levenshtein('abc', 'abc'), levenshtein('abc', ''), levenshtein('', 'abc'), levenshtein(NULL, 'abc'), levenshtein('abc', NULL);") + // HAMMING_DISTANCE tests + qt_hamming_distance_3331 "SELECT hamming_distance('', ''), hamming_distance('abc', 'abc'), hamming_distance('abc', 'abd'), hamming_distance('你好', '你们');" + testFoldConst("SELECT hamming_distance('', ''), hamming_distance('abc', 'abc'), hamming_distance('abc', 'abd'), hamming_distance('你好', '你们');") + qt_hamming_distance_3332 "SELECT hamming_distance('abc', 'abc'), hamming_distance(NULL, 'abc'), hamming_distance('abc', NULL);" + testFoldConst("SELECT hamming_distance('abc', 'abc'), hamming_distance(NULL, 'abc'), hamming_distance('abc', NULL);") + // SPACE tests qt_space_333 "SELECT space(5);" testFoldConst("SELECT space(5);") From 860b769530b72e0f46a027b25c1eeac8c253524c Mon Sep 17 00:00:00 2001 From: whisper33z Date: Thu, 29 Jan 2026 22:02:08 +0800 Subject: [PATCH 03/15] Move hamming_distance to standalone BE function file --- .../function/function_hamming_distance.cpp | 129 ++++++++++++++++++ be/src/exprs/function/function_string.cpp | 123 ----------------- .../exprs/function/simple_function_factory.h | 2 + 3 files changed, 131 insertions(+), 123 deletions(-) create mode 100644 be/src/exprs/function/function_hamming_distance.cpp diff --git a/be/src/exprs/function/function_hamming_distance.cpp b/be/src/exprs/function/function_hamming_distance.cpp new file mode 100644 index 00000000000000..2c1e6d9f12404d --- /dev/null +++ b/be/src/exprs/function/function_hamming_distance.cpp @@ -0,0 +1,129 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include "common/status.h" +#include "util/simd/vstring_function.h" +#include "vec/columns/column_string.h" +#include "vec/common/string_ref.h" +#include "vec/data_types/data_type_number.h" +#include "vec/functions/function.h" +#include "vec/functions/simple_function_factory.h" + +namespace doris::vectorized { +#include "common/compile_check_begin.h" + +class FunctionHammingDistance : public IFunction { +public: + static constexpr auto name = "hamming_distance"; + + static FunctionPtr create() { return std::make_shared(); } + + String get_name() const override { return name; } + + size_t get_number_of_arguments() const override { return 2; } + + DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { + return std::make_shared(); + } + + Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, + uint32_t result, size_t input_rows_count) const override { + const ColumnPtr left_col = block.get_by_position(arguments[0]).column; + const ColumnPtr right_col = block.get_by_position(arguments[1]).column; + + auto res_column = ColumnInt64::create(input_rows_count); + auto& res_data = res_column->get_data(); + + for (size_t i = 0; i < input_rows_count; ++i) { + const StringRef left = left_col->get_data_at(i); + const StringRef right = right_col->get_data_at(i); + RETURN_IF_ERROR(hamming_distance(left, right, res_data[i], i)); + } + + block.replace_by_position(result, std::move(res_column)); + return Status::OK(); + } + +private: + static void utf8_char_offsets(const StringRef& ref, std::vector& offsets) { + offsets.clear(); + offsets.reserve(ref.size); + const char* data = ref.data; + size_t size = ref.size; + size_t i = 0; + while (i < size) { + offsets.push_back(i); + uint8_t char_len = UTF8_BYTE_LENGTH[static_cast(data[i])]; + if (char_len == 0) { + char_len = 1; + } + if (i + char_len > size) { + char_len = static_cast(size - i); + } + i += char_len; + } + } + + static inline bool utf8_char_equal(const StringRef& left, size_t left_off, size_t left_next, + const StringRef& right, size_t right_off, + size_t right_next) { + size_t left_len = left_next - left_off; + size_t right_len = right_next - right_off; + if (left_len != right_len) { + return false; + } + return std::memcmp(left.data + left_off, right.data + right_off, left_len) == 0; + } + + static Status hamming_distance(const StringRef& left, const StringRef& right, Int64& result, + size_t row) { + std::vector left_offsets; + std::vector right_offsets; + utf8_char_offsets(left, left_offsets); + utf8_char_offsets(right, right_offsets); + + if (left_offsets.size() != right_offsets.size()) { + return Status::InvalidArgument( + "hamming_distance requires strings of the same length at row {}", row); + } + + Int64 distance = 0; + const size_t len = left_offsets.size(); + for (size_t i = 0; i < len; ++i) { + size_t left_off = left_offsets[i]; + size_t left_next = (i + 1 < len) ? left_offsets[i + 1] : left.size; + size_t right_off = right_offsets[i]; + size_t right_next = (i + 1 < len) ? right_offsets[i + 1] : right.size; + if (!utf8_char_equal(left, left_off, left_next, right, right_off, right_next)) { + ++distance; + } + } + result = distance; + return Status::OK(); + } +}; + +void register_function_hamming_distance(SimpleFunctionFactory& factory) { + factory.register_function(); +} + +#include "common/compile_check_end.h" +} // namespace doris::vectorized diff --git a/be/src/exprs/function/function_string.cpp b/be/src/exprs/function/function_string.cpp index b6320de5e1e219..c131ff23e805b9 100644 --- a/be/src/exprs/function/function_string.cpp +++ b/be/src/exprs/function/function_string.cpp @@ -47,37 +47,6 @@ namespace doris { #include "common/compile_check_begin.h" -namespace { -void utf8_char_offsets(const StringRef& ref, std::vector& offsets) { - offsets.clear(); - offsets.reserve(ref.size); - const char* data = ref.data; - size_t size = ref.size; - size_t i = 0; - while (i < size) { - offsets.push_back(i); - uint8_t char_len = UTF8_BYTE_LENGTH[static_cast(data[i])]; - if (char_len == 0) { - char_len = 1; - } - if (i + char_len > size) { - char_len = static_cast(size - i); - } - i += char_len; - } -} - -inline bool utf8_char_equal(const StringRef& left, size_t left_off, size_t left_next, - const StringRef& right, size_t right_off, size_t right_next) { - size_t left_len = left_next - left_off; - size_t right_len = right_next - right_off; - if (left_len != right_len) { - return false; - } - return std::memcmp(left.data + left_off, right.data + right_off, left_len) == 0; -} -} // namespace - struct NameStringASCII { static constexpr auto name = "ascii"; }; @@ -447,10 +416,6 @@ struct NameLocate { static constexpr auto name = "locate"; }; -struct NameHammingDistance { - static constexpr auto name = "hamming_distance"; -}; - // LeftDataType and RightDataType are DataTypeString template struct StringLocateImpl { @@ -538,89 +503,6 @@ struct StringFunctionImpl { } }; -// LeftDataType and RightDataType are DataTypeString -template -struct HammingDistanceImpl { - using ResultDataType = DataTypeInt64; - using ResultPaddedPODArray = PaddedPODArray; - - static Status vector_vector(const ColumnString::Chars& ldata, - const ColumnString::Offsets& loffsets, - const ColumnString::Chars& rdata, - const ColumnString::Offsets& roffsets, - ResultPaddedPODArray& res) { - DCHECK_EQ(loffsets.size(), roffsets.size()); - auto size = loffsets.size(); - res.resize(size); - for (size_t i = 0; i < size; ++i) { - const char* l_raw_str = reinterpret_cast(&ldata[loffsets[i - 1]]); - int l_str_size = loffsets[i] - loffsets[i - 1]; - const char* r_raw_str = reinterpret_cast(&rdata[roffsets[i - 1]]); - int r_str_size = roffsets[i] - roffsets[i - 1]; - const StringRef lref(l_raw_str, l_str_size); - const StringRef rref(r_raw_str, r_str_size); - RETURN_IF_ERROR(hamming_distance(lref, rref, res[i], i)); - } - return Status::OK(); - } - - static Status vector_scalar(const ColumnString::Chars& ldata, - const ColumnString::Offsets& loffsets, const StringRef& rdata, - ResultPaddedPODArray& res) { - auto size = loffsets.size(); - res.resize(size); - for (size_t i = 0; i < size; ++i) { - const char* l_raw_str = reinterpret_cast(&ldata[loffsets[i - 1]]); - int l_str_size = loffsets[i] - loffsets[i - 1]; - const StringRef lref(l_raw_str, l_str_size); - RETURN_IF_ERROR(hamming_distance(lref, rdata, res[i], i)); - } - return Status::OK(); - } - - static Status scalar_vector(const StringRef& ldata, const ColumnString::Chars& rdata, - const ColumnString::Offsets& roffsets, - ResultPaddedPODArray& res) { - auto size = roffsets.size(); - res.resize(size); - for (size_t i = 0; i < size; ++i) { - const char* r_raw_str = reinterpret_cast(&rdata[roffsets[i - 1]]); - int r_str_size = roffsets[i] - roffsets[i - 1]; - const StringRef rref(r_raw_str, r_str_size); - RETURN_IF_ERROR(hamming_distance(ldata, rref, res[i], i)); - } - return Status::OK(); - } - -private: - static Status hamming_distance(const StringRef& left, const StringRef& right, Int64& result, - size_t row) { - std::vector left_offsets; - std::vector right_offsets; - utf8_char_offsets(left, left_offsets); - utf8_char_offsets(right, right_offsets); - - if (left_offsets.size() != right_offsets.size()) { - return Status::InvalidArgument( - "hamming_distance requires strings of the same length at row {}", row); - } - - Int64 distance = 0; - const size_t len = left_offsets.size(); - for (size_t i = 0; i < len; ++i) { - size_t left_off = left_offsets[i]; - size_t left_next = (i + 1 < len) ? left_offsets[i + 1] : left.size; - size_t right_off = right_offsets[i]; - size_t right_next = (i + 1 < len) ? right_offsets[i + 1] : right.size; - if (!utf8_char_equal(left, left_off, left_next, right, right_off, right_next)) { - ++distance; - } - } - result = distance; - return Status::OK(); - } -}; - struct NameToLower { static constexpr auto name = "lower"; }; @@ -1445,10 +1327,6 @@ using FunctionStringLocate = FunctionBinaryToType; using FunctionStringFindInSet = FunctionBinaryToType; -using FunctionHammingDistance = - FunctionBinaryToType; - using FunctionQuote = FunctionStringToString; using FunctionToLower = FunctionStringToString, NameToLower>; @@ -1482,7 +1360,6 @@ void register_function_string(SimpleFunctionFactory& factory) { factory.register_function(); factory.register_function(); factory.register_function(); - factory.register_function(); factory.register_function(); factory.register_function(); factory.register_function(); diff --git a/be/src/exprs/function/simple_function_factory.h b/be/src/exprs/function/simple_function_factory.h index de3ff92b5919a8..1d7e26fe5593cf 100644 --- a/be/src/exprs/function/simple_function_factory.h +++ b/be/src/exprs/function/simple_function_factory.h @@ -121,6 +121,7 @@ void register_function_score(SimpleFunctionFactory& factory); void register_function_variant_type(SimpleFunctionFactory& factory); void register_function_binary(SimpleFunctionFactory& factory); void register_function_levenshtein(SimpleFunctionFactory& factory); +void register_function_hamming_distance(SimpleFunctionFactory& factory); void register_function_soundex(SimpleFunctionFactory& factory); #if defined(BE_TEST) && !defined(BE_BENCHMARK) @@ -358,6 +359,7 @@ class SimpleFunctionFactory { register_function_score(instance); register_function_binary(instance); register_function_levenshtein(instance); + register_function_hamming_distance(instance); register_function_soundex(instance); register_function_json_transform(instance); register_function_json_hash(instance); From fb63bc315abbf6cf41336d0eb9f0edcaa1dfb59c Mon Sep 17 00:00:00 2001 From: whisper33z Date: Thu, 29 Jan 2026 22:21:42 +0800 Subject: [PATCH 04/15] chore: remove unused include --- be/src/exprs/function/function_hamming_distance.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/be/src/exprs/function/function_hamming_distance.cpp b/be/src/exprs/function/function_hamming_distance.cpp index 2c1e6d9f12404d..15379937512e0b 100644 --- a/be/src/exprs/function/function_hamming_distance.cpp +++ b/be/src/exprs/function/function_hamming_distance.cpp @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. -#include #include #include From 0f7c3e55595bf0b74978ad6bb67e8a460e9e5776 Mon Sep 17 00:00:00 2001 From: whisper33z Date: Mon, 2 Feb 2026 22:34:53 +0800 Subject: [PATCH 05/15] Address review comments for levenshtein/hamming_distance tests --- .../function/function_hamming_distance.cpp | 45 +++++++++--- .../exprs/function/function_levenshtein.cpp | 72 ++++++++++++++++--- .../string_functions/test_string_all.out | 16 ++++- .../string_functions/test_string_all.groovy | 40 ++++++++++- 4 files changed, 150 insertions(+), 23 deletions(-) diff --git a/be/src/exprs/function/function_hamming_distance.cpp b/be/src/exprs/function/function_hamming_distance.cpp index 15379937512e0b..c4505fe0b17fb4 100644 --- a/be/src/exprs/function/function_hamming_distance.cpp +++ b/be/src/exprs/function/function_hamming_distance.cpp @@ -20,7 +20,10 @@ #include "common/status.h" #include "util/simd/vstring_function.h" +#include "vec/columns/column_const.h" +#include "vec/columns/column_nullable.h" #include "vec/columns/column_string.h" +#include "vec/common/typeid_cast.h" #include "vec/common/string_ref.h" #include "vec/data_types/data_type_number.h" #include "vec/functions/function.h" @@ -45,15 +48,25 @@ class FunctionHammingDistance : public IFunction { Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, uint32_t result, size_t input_rows_count) const override { - const ColumnPtr left_col = block.get_by_position(arguments[0]).column; - const ColumnPtr right_col = block.get_by_position(arguments[1]).column; + const auto& [left_col, left_const] = + unpack_if_const(block.get_by_position(arguments[0]).column); + const auto& [right_col, right_const] = + unpack_if_const(block.get_by_position(arguments[1]).column); + const auto* left_str_col = + check_and_get_column(remove_nullable(left_col).get()); + const auto* right_str_col = + check_and_get_column(remove_nullable(right_col).get()); + if (!left_str_col || !right_str_col) { + return Status::NotSupported("Illegal columns {}, {} of argument of function {}", + left_col->get_name(), right_col->get_name(), get_name()); + } auto res_column = ColumnInt64::create(input_rows_count); auto& res_data = res_column->get_data(); for (size_t i = 0; i < input_rows_count; ++i) { - const StringRef left = left_col->get_data_at(i); - const StringRef right = right_col->get_data_at(i); + const StringRef left = left_str_col->get_data_at(left_const ? 0 : i); + const StringRef right = right_str_col->get_data_at(right_const ? 0 : i); RETURN_IF_ERROR(hamming_distance(left, right, res_data[i], i)); } @@ -70,10 +83,8 @@ class FunctionHammingDistance : public IFunction { size_t i = 0; while (i < size) { offsets.push_back(i); - uint8_t char_len = UTF8_BYTE_LENGTH[static_cast(data[i])]; - if (char_len == 0) { - char_len = 1; - } + uint8_t char_len = + doris::get_utf8_byte_length(static_cast(data[i])); if (i + char_len > size) { char_len = static_cast(size - i); } @@ -94,6 +105,19 @@ class FunctionHammingDistance : public IFunction { static Status hamming_distance(const StringRef& left, const StringRef& right, Int64& result, size_t row) { + if (simd::VStringFunctions::is_ascii(left) && simd::VStringFunctions::is_ascii(right)) { + if (left.size != right.size) { + return Status::InvalidArgument( + "hamming_distance requires strings of the same length at row {}", row); + } + Int64 distance = 0; + for (size_t i = 0; i < left.size; ++i) { + distance += static_cast(left.data[i] != right.data[i]); + } + result = distance; + return Status::OK(); + } + std::vector left_offsets; std::vector right_offsets; utf8_char_offsets(left, left_offsets); @@ -111,9 +135,8 @@ class FunctionHammingDistance : public IFunction { size_t left_next = (i + 1 < len) ? left_offsets[i + 1] : left.size; size_t right_off = right_offsets[i]; size_t right_next = (i + 1 < len) ? right_offsets[i + 1] : right.size; - if (!utf8_char_equal(left, left_off, left_next, right, right_off, right_next)) { - ++distance; - } + distance += static_cast( + !utf8_char_equal(left, left_off, left_next, right, right_off, right_next)); } result = distance; return Status::OK(); diff --git a/be/src/exprs/function/function_levenshtein.cpp b/be/src/exprs/function/function_levenshtein.cpp index 5dbc042df2ef11..b3b8b29ded5153 100644 --- a/be/src/exprs/function/function_levenshtein.cpp +++ b/be/src/exprs/function/function_levenshtein.cpp @@ -21,7 +21,10 @@ #include "common/status.h" #include "util/simd/vstring_function.h" +#include "vec/columns/column_const.h" +#include "vec/columns/column_nullable.h" #include "vec/columns/column_string.h" +#include "vec/common/typeid_cast.h" #include "vec/common/string_ref.h" #include "vec/data_types/data_type_number.h" #include "vec/functions/function.h" @@ -46,15 +49,25 @@ class FunctionLevenshtein : public IFunction { Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, uint32_t result, size_t input_rows_count) const override { - const ColumnPtr left_col = block.get_by_position(arguments[0]).column; - const ColumnPtr right_col = block.get_by_position(arguments[1]).column; + const auto& [left_col, left_const] = + unpack_if_const(block.get_by_position(arguments[0]).column); + const auto& [right_col, right_const] = + unpack_if_const(block.get_by_position(arguments[1]).column); + const auto* left_str_col = + check_and_get_column(remove_nullable(left_col).get()); + const auto* right_str_col = + check_and_get_column(remove_nullable(right_col).get()); + if (!left_str_col || !right_str_col) { + return Status::NotSupported("Illegal columns {}, {} of argument of function {}", + left_col->get_name(), right_col->get_name(), get_name()); + } auto res_column = ColumnInt32::create(input_rows_count); auto& res_data = res_column->get_data(); for (size_t i = 0; i < input_rows_count; ++i) { - const StringRef left = left_col->get_data_at(i); - const StringRef right = right_col->get_data_at(i); + const StringRef left = left_str_col->get_data_at(left_const ? 0 : i); + const StringRef right = right_str_col->get_data_at(right_const ? 0 : i); res_data[i] = levenshtein_distance(left, right); } @@ -71,10 +84,8 @@ class FunctionLevenshtein : public IFunction { size_t i = 0; while (i < size) { offsets.push_back(i); - uint8_t char_len = UTF8_BYTE_LENGTH[static_cast(data[i])]; - if (char_len == 0) { - char_len = 1; - } + uint8_t char_len = + doris::get_utf8_byte_length(static_cast(data[i])); if (i + char_len > size) { char_len = static_cast(size - i); } @@ -93,7 +104,52 @@ class FunctionLevenshtein : public IFunction { return std::memcmp(left.data + left_off, right.data + right_off, left_len) == 0; } + static int levenshtein_distance_ascii(const StringRef& left, const StringRef& right) { + const size_t left_len = left.size; + const size_t right_len = right.size; + if (left_len == 0) { + return static_cast(right_len); + } + if (right_len == 0) { + return static_cast(left_len); + } + + const StringRef* left_ref = &left; + const StringRef* right_ref = &right; + size_t m = left_len; + size_t n = right_len; + if (n > m) { + std::swap(left_ref, right_ref); + std::swap(m, n); + } + + std::vector prev(n + 1); + std::vector curr(n + 1); + for (size_t j = 0; j <= n; ++j) { + prev[j] = static_cast(j); + } + + for (size_t i = 1; i <= m; ++i) { + curr[0] = static_cast(i); + const char left_char = left_ref->data[i - 1]; + + for (size_t j = 1; j <= n; ++j) { + const int cost = (left_char == right_ref->data[j - 1]) ? 0 : 1; + const int insert_cost = curr[j - 1] + 1; + const int delete_cost = prev[j] + 1; + const int replace_cost = prev[j - 1] + cost; + curr[j] = std::min({insert_cost, delete_cost, replace_cost}); + } + std::swap(prev, curr); + } + + return prev[n]; + } + static int levenshtein_distance(const StringRef& left, const StringRef& right) { + if (simd::VStringFunctions::is_ascii(left) && simd::VStringFunctions::is_ascii(right)) { + return levenshtein_distance_ascii(left, right); + } if (left.size == 0) { return static_cast( simd::VStringFunctions::get_char_len(right.data, right.size)); diff --git a/regression-test/data/query_p0/sql_functions/string_functions/test_string_all.out b/regression-test/data/query_p0/sql_functions/string_functions/test_string_all.out index b5f089287d59e1..64bbf02b63c100 100644 --- a/regression-test/data/query_p0/sql_functions/string_functions/test_string_all.out +++ b/regression-test/data/query_p0/sql_functions/string_functions/test_string_all.out @@ -971,12 +971,24 @@ R163 R163 -- !levenshtein_332 -- 0 3 3 \N \N --- !hamming_distance_3331 -- +-- !levenshtein_tbl -- +1 3 +2 0 +3 1 +4 \N + +-- !hamming_distance_333 -- 0 0 1 1 --- !hamming_distance_3332 -- +-- !hamming_distance_334 -- 0 \N \N +-- !hamming_distance_tbl -- +1 0 +2 1 +3 1 +4 \N + -- !space_333 -- diff --git a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_all.groovy b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_all.groovy index 8709863f480bfa..ce70ceed483182 100644 --- a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_all.groovy +++ b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_all.groovy @@ -758,12 +758,48 @@ suite("string_functions_all") { testFoldConst("SELECT levenshtein('', ''), levenshtein('kitten', 'sitting'), levenshtein('flaw', 'lawn'), levenshtein('你好', '你们'), levenshtein('数据库', '数据');") qt_levenshtein_332 "SELECT levenshtein('abc', 'abc'), levenshtein('abc', ''), levenshtein('', 'abc'), levenshtein(NULL, 'abc'), levenshtein('abc', NULL);" testFoldConst("SELECT levenshtein('abc', 'abc'), levenshtein('abc', ''), levenshtein('', 'abc'), levenshtein(NULL, 'abc'), levenshtein('abc', NULL);") + sql """DROP TABLE IF EXISTS string_distance_lv_test""" + sql """ + CREATE TABLE IF NOT EXISTS string_distance_lv_test ( + id int, + s1 VARCHAR, + s2 VARCHAR + ) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("replication_num"="1") + """ + sql """ + insert into string_distance_lv_test values + (1, 'kitten', 'sitting'), + (2, 'abc', 'abc'), + (3, '数据库', '数据'), + (4, null, 'abc') + """ + qt_levenshtein_tbl "SELECT id, levenshtein(s1, s2) FROM string_distance_lv_test ORDER BY id" // HAMMING_DISTANCE tests - qt_hamming_distance_3331 "SELECT hamming_distance('', ''), hamming_distance('abc', 'abc'), hamming_distance('abc', 'abd'), hamming_distance('你好', '你们');" + qt_hamming_distance_333 "SELECT hamming_distance('', ''), hamming_distance('abc', 'abc'), hamming_distance('abc', 'abd'), hamming_distance('你好', '你们');" testFoldConst("SELECT hamming_distance('', ''), hamming_distance('abc', 'abc'), hamming_distance('abc', 'abd'), hamming_distance('你好', '你们');") - qt_hamming_distance_3332 "SELECT hamming_distance('abc', 'abc'), hamming_distance(NULL, 'abc'), hamming_distance('abc', NULL);" + qt_hamming_distance_334 "SELECT hamming_distance('abc', 'abc'), hamming_distance(NULL, 'abc'), hamming_distance('abc', NULL);" testFoldConst("SELECT hamming_distance('abc', 'abc'), hamming_distance(NULL, 'abc'), hamming_distance('abc', NULL);") + sql """DROP TABLE IF EXISTS string_distance_hd_test""" + sql """ + CREATE TABLE IF NOT EXISTS string_distance_hd_test ( + id int, + s1 VARCHAR, + s2 VARCHAR + ) + DISTRIBUTED BY HASH(id) BUCKETS 1 + PROPERTIES ("replication_num"="1") + """ + sql """ + insert into string_distance_hd_test values + (1, 'abc', 'abc'), + (2, 'abc', 'abd'), + (3, '你好', '你们'), + (4, null, 'abc') + """ + qt_hamming_distance_tbl "SELECT id, hamming_distance(s1, s2) FROM string_distance_hd_test ORDER BY id" // SPACE tests qt_space_333 "SELECT space(5);" From f56ea26019059bc8e281eba40a1089ff5020de9f Mon Sep 17 00:00:00 2001 From: whisper33z Date: Fri, 27 Feb 2026 21:53:35 +0800 Subject: [PATCH 06/15] [vec][string] refactor levenshtein/hamming_distance to FunctionBinaryToType Refactor two string distance functions from custom IFunction classes to FunctionBinaryToType-based implementations. Keep existing ASCII fast path, UTF-8 behavior and hamming length validation semantics. This addresses review feedback about using shared binary-function template style. --- .../function/function_hamming_distance.cpp | 120 ++++++------ .../exprs/function/function_levenshtein.cpp | 184 ++++++++---------- 2 files changed, 149 insertions(+), 155 deletions(-) diff --git a/be/src/exprs/function/function_hamming_distance.cpp b/be/src/exprs/function/function_hamming_distance.cpp index c4505fe0b17fb4..7368ea74951e8b 100644 --- a/be/src/exprs/function/function_hamming_distance.cpp +++ b/be/src/exprs/function/function_hamming_distance.cpp @@ -20,87 +20,88 @@ #include "common/status.h" #include "util/simd/vstring_function.h" -#include "vec/columns/column_const.h" -#include "vec/columns/column_nullable.h" -#include "vec/columns/column_string.h" -#include "vec/common/typeid_cast.h" #include "vec/common/string_ref.h" #include "vec/data_types/data_type_number.h" -#include "vec/functions/function.h" +#include "vec/functions/function_totype.h" #include "vec/functions/simple_function_factory.h" namespace doris::vectorized { #include "common/compile_check_begin.h" -class FunctionHammingDistance : public IFunction { -public: +struct NameHammingDistance { static constexpr auto name = "hamming_distance"; +}; - static FunctionPtr create() { return std::make_shared(); } - - String get_name() const override { return name; } - - size_t get_number_of_arguments() const override { return 2; } - - DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { - return std::make_shared(); +template +struct HammingDistanceImpl { + using ResultDataType = DataTypeInt64; + using ResultPaddedPODArray = PaddedPODArray; + + static Status vector_vector(const ColumnString::Chars& ldata, + const ColumnString::Offsets& loffsets, + const ColumnString::Chars& rdata, + const ColumnString::Offsets& roffsets, + ResultPaddedPODArray& res) { + DCHECK_EQ(loffsets.size(), roffsets.size()); + + const size_t size = loffsets.size(); + res.resize(size); + for (size_t i = 0; i < size; ++i) { + RETURN_IF_ERROR(hamming_distance(string_ref_at(ldata, loffsets, i), + string_ref_at(rdata, roffsets, i), res[i], i)); + } + return Status::OK(); } - Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, - uint32_t result, size_t input_rows_count) const override { - const auto& [left_col, left_const] = - unpack_if_const(block.get_by_position(arguments[0]).column); - const auto& [right_col, right_const] = - unpack_if_const(block.get_by_position(arguments[1]).column); - const auto* left_str_col = - check_and_get_column(remove_nullable(left_col).get()); - const auto* right_str_col = - check_and_get_column(remove_nullable(right_col).get()); - if (!left_str_col || !right_str_col) { - return Status::NotSupported("Illegal columns {}, {} of argument of function {}", - left_col->get_name(), right_col->get_name(), get_name()); + static Status vector_scalar(const ColumnString::Chars& ldata, + const ColumnString::Offsets& loffsets, const StringRef& rdata, + ResultPaddedPODArray& res) { + const size_t size = loffsets.size(); + res.resize(size); + for (size_t i = 0; i < size; ++i) { + RETURN_IF_ERROR(hamming_distance(string_ref_at(ldata, loffsets, i), rdata, res[i], i)); } + return Status::OK(); + } - auto res_column = ColumnInt64::create(input_rows_count); - auto& res_data = res_column->get_data(); - - for (size_t i = 0; i < input_rows_count; ++i) { - const StringRef left = left_str_col->get_data_at(left_const ? 0 : i); - const StringRef right = right_str_col->get_data_at(right_const ? 0 : i); - RETURN_IF_ERROR(hamming_distance(left, right, res_data[i], i)); + static Status scalar_vector(const StringRef& ldata, const ColumnString::Chars& rdata, + const ColumnString::Offsets& roffsets, + ResultPaddedPODArray& res) { + const size_t size = roffsets.size(); + res.resize(size); + for (size_t i = 0; i < size; ++i) { + RETURN_IF_ERROR(hamming_distance(ldata, string_ref_at(rdata, roffsets, i), res[i], i)); } - - block.replace_by_position(result, std::move(res_column)); return Status::OK(); } private: + static StringRef string_ref_at(const ColumnString::Chars& data, + const ColumnString::Offsets& offsets, size_t i) { + return StringRef(reinterpret_cast(&data[offsets[i - 1]]), + offsets[i] - offsets[i - 1]); + } + static void utf8_char_offsets(const StringRef& ref, std::vector& offsets) { offsets.clear(); offsets.reserve(ref.size); - const char* data = ref.data; - size_t size = ref.size; size_t i = 0; - while (i < size) { + while (i < ref.size) { offsets.push_back(i); - uint8_t char_len = - doris::get_utf8_byte_length(static_cast(data[i])); - if (i + char_len > size) { - char_len = static_cast(size - i); + uint8_t char_len = doris::get_utf8_byte_length(static_cast(ref.data[i])); + if (i + char_len > ref.size) { + char_len = static_cast(ref.size - i); } i += char_len; } } - static inline bool utf8_char_equal(const StringRef& left, size_t left_off, size_t left_next, - const StringRef& right, size_t right_off, - size_t right_next) { - size_t left_len = left_next - left_off; - size_t right_len = right_next - right_off; - if (left_len != right_len) { - return false; - } - return std::memcmp(left.data + left_off, right.data + right_off, left_len) == 0; + static bool utf8_char_equal(const StringRef& left, size_t left_off, size_t left_next, + const StringRef& right, size_t right_off, size_t right_next) { + const size_t left_len = left_next - left_off; + const size_t right_len = right_next - right_off; + return left_len == right_len && + std::memcmp(left.data + left_off, right.data + right_off, left_len) == 0; } static Status hamming_distance(const StringRef& left, const StringRef& right, Int64& result, @@ -110,6 +111,7 @@ class FunctionHammingDistance : public IFunction { return Status::InvalidArgument( "hamming_distance requires strings of the same length at row {}", row); } + Int64 distance = 0; for (size_t i = 0; i < left.size; ++i) { distance += static_cast(left.data[i] != right.data[i]); @@ -131,18 +133,22 @@ class FunctionHammingDistance : public IFunction { Int64 distance = 0; const size_t len = left_offsets.size(); for (size_t i = 0; i < len; ++i) { - size_t left_off = left_offsets[i]; - size_t left_next = (i + 1 < len) ? left_offsets[i + 1] : left.size; - size_t right_off = right_offsets[i]; - size_t right_next = (i + 1 < len) ? right_offsets[i + 1] : right.size; + const size_t left_off = left_offsets[i]; + const size_t left_next = i + 1 < len ? left_offsets[i + 1] : left.size; + const size_t right_off = right_offsets[i]; + const size_t right_next = i + 1 < len ? right_offsets[i + 1] : right.size; distance += static_cast( !utf8_char_equal(left, left_off, left_next, right, right_off, right_next)); } + result = distance; return Status::OK(); } }; +using FunctionHammingDistance = FunctionBinaryToType; + void register_function_hamming_distance(SimpleFunctionFactory& factory) { factory.register_function(); } diff --git a/be/src/exprs/function/function_levenshtein.cpp b/be/src/exprs/function/function_levenshtein.cpp index b3b8b29ded5153..e1139a04bdcf68 100644 --- a/be/src/exprs/function/function_levenshtein.cpp +++ b/be/src/exprs/function/function_levenshtein.cpp @@ -21,123 +21,116 @@ #include "common/status.h" #include "util/simd/vstring_function.h" -#include "vec/columns/column_const.h" -#include "vec/columns/column_nullable.h" -#include "vec/columns/column_string.h" -#include "vec/common/typeid_cast.h" #include "vec/common/string_ref.h" #include "vec/data_types/data_type_number.h" -#include "vec/functions/function.h" +#include "vec/functions/function_totype.h" #include "vec/functions/simple_function_factory.h" namespace doris::vectorized { #include "common/compile_check_begin.h" -class FunctionLevenshtein : public IFunction { -public: +struct NameLevenshtein { static constexpr auto name = "levenshtein"; +}; - static FunctionPtr create() { return std::make_shared(); } - - String get_name() const override { return name; } +template +struct LevenshteinImpl { + using ResultDataType = DataTypeInt32; + using ResultPaddedPODArray = PaddedPODArray; - size_t get_number_of_arguments() const override { return 2; } + static Status vector_vector(const ColumnString::Chars& ldata, + const ColumnString::Offsets& loffsets, + const ColumnString::Chars& rdata, + const ColumnString::Offsets& roffsets, + ResultPaddedPODArray& res) { + DCHECK_EQ(loffsets.size(), roffsets.size()); - DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { - return std::make_shared(); + const size_t size = loffsets.size(); + res.resize(size); + for (size_t i = 0; i < size; ++i) { + res[i] = levenshtein_distance(string_ref_at(ldata, loffsets, i), + string_ref_at(rdata, roffsets, i)); + } + return Status::OK(); } - Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, - uint32_t result, size_t input_rows_count) const override { - const auto& [left_col, left_const] = - unpack_if_const(block.get_by_position(arguments[0]).column); - const auto& [right_col, right_const] = - unpack_if_const(block.get_by_position(arguments[1]).column); - const auto* left_str_col = - check_and_get_column(remove_nullable(left_col).get()); - const auto* right_str_col = - check_and_get_column(remove_nullable(right_col).get()); - if (!left_str_col || !right_str_col) { - return Status::NotSupported("Illegal columns {}, {} of argument of function {}", - left_col->get_name(), right_col->get_name(), get_name()); + static Status vector_scalar(const ColumnString::Chars& ldata, + const ColumnString::Offsets& loffsets, const StringRef& rdata, + ResultPaddedPODArray& res) { + const size_t size = loffsets.size(); + res.resize(size); + for (size_t i = 0; i < size; ++i) { + res[i] = levenshtein_distance(string_ref_at(ldata, loffsets, i), rdata); } + return Status::OK(); + } - auto res_column = ColumnInt32::create(input_rows_count); - auto& res_data = res_column->get_data(); - - for (size_t i = 0; i < input_rows_count; ++i) { - const StringRef left = left_str_col->get_data_at(left_const ? 0 : i); - const StringRef right = right_str_col->get_data_at(right_const ? 0 : i); - res_data[i] = levenshtein_distance(left, right); + static Status scalar_vector(const StringRef& ldata, const ColumnString::Chars& rdata, + const ColumnString::Offsets& roffsets, + ResultPaddedPODArray& res) { + const size_t size = roffsets.size(); + res.resize(size); + for (size_t i = 0; i < size; ++i) { + res[i] = levenshtein_distance(ldata, string_ref_at(rdata, roffsets, i)); } - - block.replace_by_position(result, std::move(res_column)); return Status::OK(); } private: + static StringRef string_ref_at(const ColumnString::Chars& data, + const ColumnString::Offsets& offsets, size_t i) { + return StringRef(reinterpret_cast(&data[offsets[i - 1]]), + offsets[i] - offsets[i - 1]); + } + static void utf8_char_offsets(const StringRef& ref, std::vector& offsets) { offsets.clear(); offsets.reserve(ref.size); - const char* data = ref.data; - size_t size = ref.size; size_t i = 0; - while (i < size) { + while (i < ref.size) { offsets.push_back(i); - uint8_t char_len = - doris::get_utf8_byte_length(static_cast(data[i])); - if (i + char_len > size) { - char_len = static_cast(size - i); + uint8_t char_len = doris::get_utf8_byte_length(static_cast(ref.data[i])); + if (i + char_len > ref.size) { + char_len = static_cast(ref.size - i); } i += char_len; } } - static inline bool utf8_char_equal(const StringRef& left, size_t left_off, size_t left_next, - const StringRef& right, size_t right_off, - size_t right_next) { - size_t left_len = left_next - left_off; - size_t right_len = right_next - right_off; - if (left_len != right_len) { - return false; - } - return std::memcmp(left.data + left_off, right.data + right_off, left_len) == 0; + static bool utf8_char_equal(const StringRef& left, size_t left_off, size_t left_next, + const StringRef& right, size_t right_off, size_t right_next) { + const size_t left_len = left_next - left_off; + const size_t right_len = right_next - right_off; + return left_len == right_len && + std::memcmp(left.data + left_off, right.data + right_off, left_len) == 0; } - static int levenshtein_distance_ascii(const StringRef& left, const StringRef& right) { - const size_t left_len = left.size; - const size_t right_len = right.size; - if (left_len == 0) { - return static_cast(right_len); - } - if (right_len == 0) { - return static_cast(left_len); - } - + static Int32 levenshtein_distance_ascii(const StringRef& left, const StringRef& right) { const StringRef* left_ref = &left; const StringRef* right_ref = &right; - size_t m = left_len; - size_t n = right_len; + size_t m = left.size; + size_t n = right.size; + if (n > m) { std::swap(left_ref, right_ref); std::swap(m, n); } - std::vector prev(n + 1); - std::vector curr(n + 1); + std::vector prev(n + 1); + std::vector curr(n + 1); for (size_t j = 0; j <= n; ++j) { - prev[j] = static_cast(j); + prev[j] = static_cast(j); } for (size_t i = 1; i <= m; ++i) { - curr[0] = static_cast(i); + curr[0] = static_cast(i); const char left_char = left_ref->data[i - 1]; for (size_t j = 1; j <= n; ++j) { - const int cost = (left_char == right_ref->data[j - 1]) ? 0 : 1; - const int insert_cost = curr[j - 1] + 1; - const int delete_cost = prev[j] + 1; - const int replace_cost = prev[j - 1] + cost; + const Int32 cost = left_char == right_ref->data[j - 1] ? 0 : 1; + const Int32 insert_cost = curr[j - 1] + 1; + const Int32 delete_cost = prev[j] + 1; + const Int32 replace_cost = prev[j - 1] + cost; curr[j] = std::min({insert_cost, delete_cost, replace_cost}); } std::swap(prev, curr); @@ -146,17 +139,16 @@ class FunctionLevenshtein : public IFunction { return prev[n]; } - static int levenshtein_distance(const StringRef& left, const StringRef& right) { + static Int32 levenshtein_distance(const StringRef& left, const StringRef& right) { if (simd::VStringFunctions::is_ascii(left) && simd::VStringFunctions::is_ascii(right)) { return levenshtein_distance_ascii(left, right); } + if (left.size == 0) { - return static_cast( - simd::VStringFunctions::get_char_len(right.data, right.size)); + return static_cast(simd::VStringFunctions::get_char_len(right.data, right.size)); } if (right.size == 0) { - return static_cast( - simd::VStringFunctions::get_char_len(left.data, left.size)); + return static_cast(simd::VStringFunctions::get_char_len(left.data, left.size)); } std::vector left_offsets; @@ -174,36 +166,29 @@ class FunctionLevenshtein : public IFunction { const size_t m = left_offsets.size(); const size_t n = right_offsets.size(); - if (m == 0) { - return static_cast(n); - } - if (n == 0) { - return static_cast(m); - } - - std::vector prev(n + 1); - std::vector curr(n + 1); + std::vector prev(n + 1); + std::vector curr(n + 1); for (size_t j = 0; j <= n; ++j) { - prev[j] = static_cast(j); + prev[j] = static_cast(j); } for (size_t i = 1; i <= m; ++i) { - curr[0] = static_cast(i); - size_t left_off = left_offsets[i - 1]; - size_t left_next = (i < m) ? left_offsets[i] : left_ref->size; + curr[0] = static_cast(i); + const size_t left_off = left_offsets[i - 1]; + const size_t left_next = i < m ? left_offsets[i] : left_ref->size; for (size_t j = 1; j <= n; ++j) { - size_t right_off = right_offsets[j - 1]; - size_t right_next = (j < n) ? right_offsets[j] : right_ref->size; + const size_t right_off = right_offsets[j - 1]; + const size_t right_next = j < n ? right_offsets[j] : right_ref->size; - int cost = utf8_char_equal(*left_ref, left_off, left_next, *right_ref, right_off, - right_next) - ? 0 - : 1; + const Int32 cost = utf8_char_equal(*left_ref, left_off, left_next, *right_ref, + right_off, right_next) + ? 0 + : 1; - int insert_cost = curr[j - 1] + 1; - int delete_cost = prev[j] + 1; - int replace_cost = prev[j - 1] + cost; + const Int32 insert_cost = curr[j - 1] + 1; + const Int32 delete_cost = prev[j] + 1; + const Int32 replace_cost = prev[j - 1] + cost; curr[j] = std::min({insert_cost, delete_cost, replace_cost}); } std::swap(prev, curr); @@ -213,6 +198,9 @@ class FunctionLevenshtein : public IFunction { } }; +using FunctionLevenshtein = + FunctionBinaryToType; + void register_function_levenshtein(SimpleFunctionFactory& factory) { factory.register_function(); } From 2117356d851992051bc9c28dce98f5efadca9c3f Mon Sep 17 00:00:00 2001 From: whisper33z Date: Fri, 27 Feb 2026 22:49:07 +0800 Subject: [PATCH 07/15] [vec][string] refactor levenshtein/hamming_distance with FunctionBinaryToType and extend regression cases Refactor BE implementations to FunctionBinaryToType template style and keep UTF-8/ASCII semantics. Optimize hamming_distance UTF-8 loop branch handling. Add and align query_p0 + nereids_p0 constant/table test cases and expected outputs. --- .../function/function_hamming_distance.cpp | 18 +++++++++++------- be/src/exprs/function/function_levenshtein.cpp | 6 ++---- .../string_functions/test_string_all.out | 12 ++++++++++++ .../string_functions/test_string_all.groovy | 14 ++++++++++++-- 4 files changed, 37 insertions(+), 13 deletions(-) diff --git a/be/src/exprs/function/function_hamming_distance.cpp b/be/src/exprs/function/function_hamming_distance.cpp index 7368ea74951e8b..050c061ed27fbe 100644 --- a/be/src/exprs/function/function_hamming_distance.cpp +++ b/be/src/exprs/function/function_hamming_distance.cpp @@ -40,8 +40,7 @@ struct HammingDistanceImpl { static Status vector_vector(const ColumnString::Chars& ldata, const ColumnString::Offsets& loffsets, const ColumnString::Chars& rdata, - const ColumnString::Offsets& roffsets, - ResultPaddedPODArray& res) { + const ColumnString::Offsets& roffsets, ResultPaddedPODArray& res) { DCHECK_EQ(loffsets.size(), roffsets.size()); const size_t size = loffsets.size(); @@ -65,8 +64,7 @@ struct HammingDistanceImpl { } static Status scalar_vector(const StringRef& ldata, const ColumnString::Chars& rdata, - const ColumnString::Offsets& roffsets, - ResultPaddedPODArray& res) { + const ColumnString::Offsets& roffsets, ResultPaddedPODArray& res) { const size_t size = roffsets.size(); res.resize(size); for (size_t i = 0; i < size; ++i) { @@ -132,14 +130,20 @@ struct HammingDistanceImpl { Int64 distance = 0; const size_t len = left_offsets.size(); - for (size_t i = 0; i < len; ++i) { + for (size_t i = 0; i + 1 < len; ++i) { const size_t left_off = left_offsets[i]; - const size_t left_next = i + 1 < len ? left_offsets[i + 1] : left.size; + const size_t left_next = left_offsets[i + 1]; const size_t right_off = right_offsets[i]; - const size_t right_next = i + 1 < len ? right_offsets[i + 1] : right.size; + const size_t right_next = right_offsets[i + 1]; distance += static_cast( !utf8_char_equal(left, left_off, left_next, right, right_off, right_next)); } + if (len > 0) { + const size_t left_off = left_offsets[len - 1]; + const size_t right_off = right_offsets[len - 1]; + distance += static_cast( + !utf8_char_equal(left, left_off, left.size, right, right_off, right.size)); + } result = distance; return Status::OK(); diff --git a/be/src/exprs/function/function_levenshtein.cpp b/be/src/exprs/function/function_levenshtein.cpp index e1139a04bdcf68..adcdd18284832e 100644 --- a/be/src/exprs/function/function_levenshtein.cpp +++ b/be/src/exprs/function/function_levenshtein.cpp @@ -41,8 +41,7 @@ struct LevenshteinImpl { static Status vector_vector(const ColumnString::Chars& ldata, const ColumnString::Offsets& loffsets, const ColumnString::Chars& rdata, - const ColumnString::Offsets& roffsets, - ResultPaddedPODArray& res) { + const ColumnString::Offsets& roffsets, ResultPaddedPODArray& res) { DCHECK_EQ(loffsets.size(), roffsets.size()); const size_t size = loffsets.size(); @@ -66,8 +65,7 @@ struct LevenshteinImpl { } static Status scalar_vector(const StringRef& ldata, const ColumnString::Chars& rdata, - const ColumnString::Offsets& roffsets, - ResultPaddedPODArray& res) { + const ColumnString::Offsets& roffsets, ResultPaddedPODArray& res) { const size_t size = roffsets.size(); res.resize(size); for (size_t i = 0; i < size; ++i) { diff --git a/regression-test/data/query_p0/sql_functions/string_functions/test_string_all.out b/regression-test/data/query_p0/sql_functions/string_functions/test_string_all.out index 64bbf02b63c100..d76de3fb6195aa 100644 --- a/regression-test/data/query_p0/sql_functions/string_functions/test_string_all.out +++ b/regression-test/data/query_p0/sql_functions/string_functions/test_string_all.out @@ -971,11 +971,17 @@ R163 R163 -- !levenshtein_332 -- 0 3 3 \N \N +-- !levenshtein_333 -- +2 1 1 + -- !levenshtein_tbl -- 1 3 2 0 3 1 4 \N +5 1 +6 2 +7 3 -- !hamming_distance_333 -- 0 0 1 1 @@ -983,11 +989,17 @@ R163 R163 -- !hamming_distance_334 -- 0 \N \N +-- !hamming_distance_335 -- +4 1 2 + -- !hamming_distance_tbl -- 1 0 2 1 3 1 4 \N +5 4 +6 1 +7 2 -- !space_333 -- diff --git a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_all.groovy b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_all.groovy index ce70ceed483182..5417511317c265 100644 --- a/regression-test/suites/query_p0/sql_functions/string_functions/test_string_all.groovy +++ b/regression-test/suites/query_p0/sql_functions/string_functions/test_string_all.groovy @@ -758,6 +758,8 @@ suite("string_functions_all") { testFoldConst("SELECT levenshtein('', ''), levenshtein('kitten', 'sitting'), levenshtein('flaw', 'lawn'), levenshtein('你好', '你们'), levenshtein('数据库', '数据');") qt_levenshtein_332 "SELECT levenshtein('abc', 'abc'), levenshtein('abc', ''), levenshtein('', 'abc'), levenshtein(NULL, 'abc'), levenshtein('abc', NULL);" testFoldConst("SELECT levenshtein('abc', 'abc'), levenshtein('abc', ''), levenshtein('', 'abc'), levenshtein(NULL, 'abc'), levenshtein('abc', NULL);") + qt_levenshtein_333 "SELECT levenshtein('abcd', 'abdc'), levenshtein('你好呀', '你好'), levenshtein('a你b', 'a们b');" + testFoldConst("SELECT levenshtein('abcd', 'abdc'), levenshtein('你好呀', '你好'), levenshtein('a你b', 'a们b');") sql """DROP TABLE IF EXISTS string_distance_lv_test""" sql """ CREATE TABLE IF NOT EXISTS string_distance_lv_test ( @@ -773,7 +775,10 @@ suite("string_functions_all") { (1, 'kitten', 'sitting'), (2, 'abc', 'abc'), (3, '数据库', '数据'), - (4, null, 'abc') + (4, null, 'abc'), + (5, '你好呀', '你好'), + (6, 'abcd', 'abdc'), + (7, '', '数据库') """ qt_levenshtein_tbl "SELECT id, levenshtein(s1, s2) FROM string_distance_lv_test ORDER BY id" @@ -782,6 +787,8 @@ suite("string_functions_all") { testFoldConst("SELECT hamming_distance('', ''), hamming_distance('abc', 'abc'), hamming_distance('abc', 'abd'), hamming_distance('你好', '你们');") qt_hamming_distance_334 "SELECT hamming_distance('abc', 'abc'), hamming_distance(NULL, 'abc'), hamming_distance('abc', NULL);" testFoldConst("SELECT hamming_distance('abc', 'abc'), hamming_distance(NULL, 'abc'), hamming_distance('abc', NULL);") + qt_hamming_distance_335 "SELECT hamming_distance('abcd', 'wxyz'), hamming_distance('你好吗', '你们吗'), hamming_distance('数据库', '数库据');" + testFoldConst("SELECT hamming_distance('abcd', 'wxyz'), hamming_distance('你好吗', '你们吗'), hamming_distance('数据库', '数库据');") sql """DROP TABLE IF EXISTS string_distance_hd_test""" sql """ CREATE TABLE IF NOT EXISTS string_distance_hd_test ( @@ -797,7 +804,10 @@ suite("string_functions_all") { (1, 'abc', 'abc'), (2, 'abc', 'abd'), (3, '你好', '你们'), - (4, null, 'abc') + (4, null, 'abc'), + (5, 'abcd', 'wxyz'), + (6, '你好吗', '你们吗'), + (7, '数据库', '数库据') """ qt_hamming_distance_tbl "SELECT id, hamming_distance(s1, s2) FROM string_distance_hd_test ORDER BY id" From 3c2de98a06e8eb5eab371292ecb55ec111fc11a6 Mon Sep 17 00:00:00 2001 From: whisper33z Date: Sun, 1 Mar 2026 12:36:41 +0800 Subject: [PATCH 08/15] [fix] harden string distance helpers for bounds safety --- .../function/function_hamming_distance.cpp | 18 ++++++++++++++++-- be/src/exprs/function/function_levenshtein.cpp | 18 ++++++++++++++++-- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/be/src/exprs/function/function_hamming_distance.cpp b/be/src/exprs/function/function_hamming_distance.cpp index 050c061ed27fbe..04d29764b6b8ce 100644 --- a/be/src/exprs/function/function_hamming_distance.cpp +++ b/be/src/exprs/function/function_hamming_distance.cpp @@ -76,8 +76,19 @@ struct HammingDistanceImpl { private: static StringRef string_ref_at(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, size_t i) { - return StringRef(reinterpret_cast(&data[offsets[i - 1]]), - offsets[i] - offsets[i - 1]); + DCHECK_LT(i, offsets.size()); + const size_t begin = (i == 0) ? 0 : offsets[i - 1]; + const size_t end = offsets[i]; + if (end <= begin || end > data.size()) { + return StringRef("", 0); + } + + size_t str_size = end - begin; + // ColumnString offsets usually include trailing '\0' for each row. + if (data[end - 1] == '\0') { + --str_size; + } + return StringRef(reinterpret_cast(data.data() + begin), str_size); } static void utf8_char_offsets(const StringRef& ref, std::vector& offsets) { @@ -87,6 +98,9 @@ struct HammingDistanceImpl { while (i < ref.size) { offsets.push_back(i); uint8_t char_len = doris::get_utf8_byte_length(static_cast(ref.data[i])); + if (char_len == 0) { + char_len = 1; + } if (i + char_len > ref.size) { char_len = static_cast(ref.size - i); } diff --git a/be/src/exprs/function/function_levenshtein.cpp b/be/src/exprs/function/function_levenshtein.cpp index adcdd18284832e..62531c025f81f2 100644 --- a/be/src/exprs/function/function_levenshtein.cpp +++ b/be/src/exprs/function/function_levenshtein.cpp @@ -77,8 +77,19 @@ struct LevenshteinImpl { private: static StringRef string_ref_at(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, size_t i) { - return StringRef(reinterpret_cast(&data[offsets[i - 1]]), - offsets[i] - offsets[i - 1]); + DCHECK_LT(i, offsets.size()); + const size_t begin = (i == 0) ? 0 : offsets[i - 1]; + const size_t end = offsets[i]; + if (end <= begin || end > data.size()) { + return StringRef("", 0); + } + + size_t str_size = end - begin; + // ColumnString offsets usually include trailing '\0' for each row. + if (data[end - 1] == '\0') { + --str_size; + } + return StringRef(reinterpret_cast(data.data() + begin), str_size); } static void utf8_char_offsets(const StringRef& ref, std::vector& offsets) { @@ -88,6 +99,9 @@ struct LevenshteinImpl { while (i < ref.size) { offsets.push_back(i); uint8_t char_len = doris::get_utf8_byte_length(static_cast(ref.data[i])); + if (char_len == 0) { + char_len = 1; + } if (i + char_len > ref.size) { char_len = static_cast(ref.size - i); } From 2c5fffc7c730c4afcc0b025089073ac70cac3e5c Mon Sep 17 00:00:00 2001 From: whisper33z Date: Sun, 1 Mar 2026 12:43:17 +0800 Subject: [PATCH 09/15] [fix] align string_ref_at with ColumnString offset semantics --- be/src/exprs/function/function_hamming_distance.cpp | 8 +------- be/src/exprs/function/function_levenshtein.cpp | 8 +------- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/be/src/exprs/function/function_hamming_distance.cpp b/be/src/exprs/function/function_hamming_distance.cpp index 04d29764b6b8ce..eb763727e8cf1f 100644 --- a/be/src/exprs/function/function_hamming_distance.cpp +++ b/be/src/exprs/function/function_hamming_distance.cpp @@ -82,13 +82,7 @@ struct HammingDistanceImpl { if (end <= begin || end > data.size()) { return StringRef("", 0); } - - size_t str_size = end - begin; - // ColumnString offsets usually include trailing '\0' for each row. - if (data[end - 1] == '\0') { - --str_size; - } - return StringRef(reinterpret_cast(data.data() + begin), str_size); + return StringRef(reinterpret_cast(data.data() + begin), end - begin); } static void utf8_char_offsets(const StringRef& ref, std::vector& offsets) { diff --git a/be/src/exprs/function/function_levenshtein.cpp b/be/src/exprs/function/function_levenshtein.cpp index 62531c025f81f2..a5a853d8de5f1c 100644 --- a/be/src/exprs/function/function_levenshtein.cpp +++ b/be/src/exprs/function/function_levenshtein.cpp @@ -83,13 +83,7 @@ struct LevenshteinImpl { if (end <= begin || end > data.size()) { return StringRef("", 0); } - - size_t str_size = end - begin; - // ColumnString offsets usually include trailing '\0' for each row. - if (data[end - 1] == '\0') { - --str_size; - } - return StringRef(reinterpret_cast(data.data() + begin), str_size); + return StringRef(reinterpret_cast(data.data() + begin), end - begin); } static void utf8_char_offsets(const StringRef& ref, std::vector& offsets) { From 9c59bc83e85f6e96980baf20a3ab940e927a053e Mon Sep 17 00:00:00 2001 From: whisper33z Date: Sun, 1 Mar 2026 15:11:26 +0800 Subject: [PATCH 10/15] [fix](vec): align hamming_distance null semantics and update string function regression cases --- .../function/function_hamming_distance.cpp | 131 +++++++++++++++++- .../exprs/function/function_levenshtein.cpp | 7 +- 2 files changed, 130 insertions(+), 8 deletions(-) diff --git a/be/src/exprs/function/function_hamming_distance.cpp b/be/src/exprs/function/function_hamming_distance.cpp index eb763727e8cf1f..74fb45cf486436 100644 --- a/be/src/exprs/function/function_hamming_distance.cpp +++ b/be/src/exprs/function/function_hamming_distance.cpp @@ -15,11 +15,14 @@ // specific language governing permissions and limitations // under the License. +#include #include #include #include "common/status.h" #include "util/simd/vstring_function.h" +#include "vec/columns/column_nullable.h" +#include "vec/columns/column_string.h" #include "vec/common/string_ref.h" #include "vec/data_types/data_type_number.h" #include "vec/functions/function_totype.h" @@ -46,8 +49,8 @@ struct HammingDistanceImpl { const size_t size = loffsets.size(); res.resize(size); for (size_t i = 0; i < size; ++i) { - RETURN_IF_ERROR(hamming_distance(string_ref_at(ldata, loffsets, i), - string_ref_at(rdata, roffsets, i), res[i], i)); + RETURN_IF_ERROR(one_row(string_ref_at(ldata, loffsets, i), + string_ref_at(rdata, roffsets, i), res[i], i)); } return Status::OK(); } @@ -58,7 +61,7 @@ struct HammingDistanceImpl { const size_t size = loffsets.size(); res.resize(size); for (size_t i = 0; i < size; ++i) { - RETURN_IF_ERROR(hamming_distance(string_ref_at(ldata, loffsets, i), rdata, res[i], i)); + RETURN_IF_ERROR(one_row(string_ref_at(ldata, loffsets, i), rdata, res[i], i)); } return Status::OK(); } @@ -68,11 +71,16 @@ struct HammingDistanceImpl { const size_t size = roffsets.size(); res.resize(size); for (size_t i = 0; i < size; ++i) { - RETURN_IF_ERROR(hamming_distance(ldata, string_ref_at(rdata, roffsets, i), res[i], i)); + RETURN_IF_ERROR(one_row(ldata, string_ref_at(rdata, roffsets, i), res[i], i)); } return Status::OK(); } + static Status one_row(const StringRef& left, const StringRef& right, Int64& result, + size_t row) { + return hamming_distance(left, right, result, row); + } + private: static StringRef string_ref_at(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, size_t i) { @@ -82,7 +90,12 @@ struct HammingDistanceImpl { if (end <= begin || end > data.size()) { return StringRef("", 0); } - return StringRef(reinterpret_cast(data.data() + begin), end - begin); + + size_t str_size = end - begin; + if (str_size > 0 && data[end - 1] == '\0') { + --str_size; + } + return StringRef(reinterpret_cast(data.data() + begin), str_size); } static void utf8_char_offsets(const StringRef& ref, std::vector& offsets) { @@ -158,8 +171,112 @@ struct HammingDistanceImpl { } }; -using FunctionHammingDistance = FunctionBinaryToType; +template