From 9f8cfd36e70eb87b8f3e214a5f3b1d562a102757 Mon Sep 17 00:00:00 2001 From: ntjohnson1 <24689722+ntjohnson1@users.noreply.github.com> Date: Thu, 5 Mar 2026 14:58:19 -0500 Subject: [PATCH] Add docstring examples for Scalar string functions Add example usage to docstrings for Scalar string functions to improve documentation. Co-Authored-By: Claude Opus 4.6 --- python/datafusion/functions.py | 379 ++++++++++++++++++++++++++++++--- 1 file changed, 353 insertions(+), 26 deletions(-) diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index fd116254b..a64d19e01 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -521,7 +521,16 @@ def acosh(arg: Expr) -> Expr: def ascii(arg: Expr) -> Expr: - """Returns the numeric code of the first character of the argument.""" + """Returns the numeric code of the first character of the argument. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["a","b","c"]}) + >>> ascii_df = df.select(dfn.functions.ascii(dfn.col("a")).alias("ascii")) + >>> ascii_df.collect_column("ascii")[0].as_py() + 97 + """ return Expr(f.ascii(arg.expr)) @@ -597,12 +606,30 @@ def atan2(y: Expr, x: Expr) -> Expr: def bit_length(arg: Expr) -> Expr: - """Returns the number of bits in the string argument.""" + """Returns the number of bits in the string argument. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["a","b","c"]}) + >>> bit_df = df.select(dfn.functions.bit_length(dfn.col("a")).alias("bit_len")) + >>> bit_df.collect_column("bit_len")[0].as_py() + 8 + """ return Expr(f.bit_length(arg.expr)) def btrim(arg: Expr) -> Expr: - """Removes all characters, spaces by default, from both sides of a string.""" + """Removes all characters, spaces by default, from both sides of a string. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [" a "]}) + >>> trim_df = df.select(dfn.functions.btrim(dfn.col("a")).alias("trimmed")) + >>> trim_df.collect_column("trimmed")[0].as_py() + 'a' + """ return Expr(f.btrim(arg.expr)) @@ -617,22 +644,59 @@ def ceil(arg: Expr) -> Expr: def character_length(arg: Expr) -> Expr: - """Returns the number of characters in the argument.""" + """Returns the number of characters in the argument. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["abc","b","c"]}) + >>> char_len_df = df.select( + ... dfn.functions.character_length(dfn.col("a")).alias("char_len")) + >>> char_len_df.collect_column("char_len")[0].as_py() + 3 + """ return Expr(f.character_length(arg.expr)) def length(string: Expr) -> Expr: - """The number of characters in the ``string``.""" + """The number of characters in the ``string``. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select(dfn.functions.length(dfn.col("a")).alias("len")) + >>> result.collect_column("len")[0].as_py() + 5 + """ return Expr(f.length(string.expr)) def char_length(string: Expr) -> Expr: - """The number of characters in the ``string``.""" + """The number of characters in the ``string``. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select(dfn.functions.char_length(dfn.col("a")).alias("len")) + >>> result.collect_column("len")[0].as_py() + 5 + """ return Expr(f.char_length(string.expr)) def chr(arg: Expr) -> Expr: - """Converts the Unicode code point to a UTF8 character.""" + """Converts the Unicode code point to a UTF8 character. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [65]}) + >>> result = df.select(dfn.functions.chr(dfn.col("a")).alias("chr")) + >>> result.collect_column("chr")[0].as_py() + 'A' + """ return Expr(f.chr(arg.expr)) @@ -706,7 +770,17 @@ def degrees(arg: Expr) -> Expr: def ends_with(arg: Expr, suffix: Expr) -> Expr: - """Returns true if the ``string`` ends with the ``suffix``, false otherwise.""" + """Returns true if the ``string`` ends with the ``suffix``, false otherwise. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["abc","b","c"]}) + >>> ends_with_df = df.select( + ... dfn.functions.ends_with(dfn.col("a"), dfn.lit("c")).alias("ends_with")) + >>> ends_with_df.collect_column("ends_with")[0].as_py() + True + """ return Expr(f.ends_with(arg.expr, suffix.expr)) @@ -727,6 +801,15 @@ def find_in_set(string: Expr, string_list: Expr) -> Expr: ``string_list`` consisting of N substrings. The string list is a string composed of substrings separated by ``,`` characters. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["b"]}) + >>> result = df.select( + ... dfn.functions.find_in_set(dfn.col("a"), dfn.lit("a,b,c")).alias("pos")) + >>> result.collect_column("pos")[0].as_py() + 2 """ return Expr(f.find_in_set(string.expr, string_list.expr)) @@ -746,6 +829,14 @@ def initcap(string: Expr) -> Expr: Converts the first letter of each word in ``string`` to uppercase and the remaining characters to lowercase. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["the cat"]}) + >>> cap_df = df.select(dfn.functions.initcap(dfn.col("a")).alias("cap")) + >>> cap_df.collect_column("cap")[0].as_py() + 'The Cat' """ return Expr(f.initcap(string.expr)) @@ -754,6 +845,15 @@ def instr(string: Expr, substring: Expr) -> Expr: """Finds the position from where the ``substring`` matches the ``string``. This is an alias for :py:func:`strpos`. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello world"]}) + >>> result = df.select( + ... dfn.functions.instr(dfn.col("a"), dfn.lit("world")).alias("pos")) + >>> result.collect_column("pos")[0].as_py() + 7 """ return strpos(string, substring) @@ -769,12 +869,31 @@ def lcm(x: Expr, y: Expr) -> Expr: def left(string: Expr, n: Expr) -> Expr: - """Returns the first ``n`` characters in the ``string``.""" + """Returns the first ``n`` characters in the ``string``. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["the cat"]}) + >>> left_df = df.select(dfn.functions.left(dfn.col("a"), dfn.lit(3)).alias("left")) + >>> left_df.collect_column("left")[0].as_py() + 'the' + """ return Expr(f.left(string.expr, n.expr)) def levenshtein(string1: Expr, string2: Expr) -> Expr: - """Returns the Levenshtein distance between the two given strings.""" + """Returns the Levenshtein distance between the two given strings. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["kitten"]}) + >>> result = df.select( + ... dfn.functions.levenshtein(dfn.col("a"), dfn.lit("sitting")).alias("d")) + >>> result.collect_column("d")[0].as_py() + 3 + """ return Expr(f.levenshtein(string1.expr, string2.expr)) @@ -799,7 +918,16 @@ def log2(arg: Expr) -> Expr: def lower(arg: Expr) -> Expr: - """Converts a string to lowercase.""" + """Converts a string to lowercase. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["THE CaT"]}) + >>> lower_df = df.select(dfn.functions.lower(dfn.col("a")).alias("lower")) + >>> lower_df.collect_column("lower")[0].as_py() + 'the cat' + """ return Expr(f.lower(arg.expr)) @@ -809,13 +937,32 @@ def lpad(string: Expr, count: Expr, characters: Expr | None = None) -> Expr: Extends the string to length length by prepending the characters fill (a space by default). If the string is already longer than length then it is truncated (on the right). + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["the cat", "a hat"]}) + >>> lpad_df = df.select(dfn.functions.lpad(dfn.col("a"), dfn.lit(6)).alias("lpad")) + >>> lpad_df.collect_column("lpad")[0].as_py() + 'the ca' + >>> lpad_df.collect_column("lpad")[1].as_py() + ' a hat' """ characters = characters if characters is not None else Expr.literal(" ") return Expr(f.lpad(string.expr, count.expr, characters.expr)) def ltrim(arg: Expr) -> Expr: - """Removes all characters, spaces by default, from the beginning of a string.""" + """Removes all characters, spaces by default, from the beginning of a string. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [" a "]}) + >>> trim_df = df.select(dfn.functions.ltrim(dfn.col("a")).alias("trimmed")) + >>> trim_df.collect_column("trimmed")[0].as_py() + 'a ' + """ return Expr(f.ltrim(arg.expr)) @@ -835,7 +982,16 @@ def nvl(x: Expr, y: Expr) -> Expr: def octet_length(arg: Expr) -> Expr: - """Returns the number of bytes of a string.""" + """Returns the number of bytes of a string. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select(dfn.functions.octet_length(dfn.col("a")).alias("len")) + >>> result.collect_column("len")[0].as_py() + 5 + """ return Expr(f.octet_length(arg.expr)) @@ -846,6 +1002,16 @@ def overlay( Replace the substring of string that starts at the ``start``'th character and extends for ``length`` characters with new substring. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["abcdef"]}) + >>> result = df.select( + ... dfn.functions.overlay(dfn.col("a"), dfn.lit("XY"), dfn.lit(3), + ... dfn.lit(2)).alias("o")) + >>> result.collect_column("o")[0].as_py() + 'abXYef' """ if length is None: return Expr(f.overlay(string.expr, substring.expr, start.expr)) @@ -861,6 +1027,15 @@ def position(string: Expr, substring: Expr) -> Expr: """Finds the position from where the ``substring`` matches the ``string``. This is an alias for :py:func:`strpos`. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select( + ... dfn.functions.position(dfn.col("a"), dfn.lit("llo")).alias("pos")) + >>> result.collect_column("pos")[0].as_py() + 3 """ return strpos(string, substring) @@ -983,22 +1158,60 @@ def regexp_instr( def repeat(string: Expr, n: Expr) -> Expr: - """Repeats the ``string`` to ``n`` times.""" + """Repeats the ``string`` to ``n`` times. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["ha"]}) + >>> result = df.select(dfn.functions.repeat(dfn.col("a"), dfn.lit(3)).alias("r")) + >>> result.collect_column("r")[0].as_py() + 'hahaha' + """ return Expr(f.repeat(string.expr, n.expr)) def replace(string: Expr, from_val: Expr, to_val: Expr) -> Expr: - """Replaces all occurrences of ``from_val`` with ``to_val`` in the ``string``.""" + """Replaces all occurrences of ``from_val`` with ``to_val`` in the ``string``. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello world"]}) + >>> result = df.select( + ... dfn.functions.replace(dfn.col("a"), dfn.lit("world"), + ... dfn.lit("there")).alias("r")) + >>> result.collect_column("r")[0].as_py() + 'hello there' + """ return Expr(f.replace(string.expr, from_val.expr, to_val.expr)) def reverse(arg: Expr) -> Expr: - """Reverse the string argument.""" + """Reverse the string argument. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select(dfn.functions.reverse(dfn.col("a")).alias("r")) + >>> result.collect_column("r")[0].as_py() + 'olleh' + """ return Expr(f.reverse(arg.expr)) def right(string: Expr, n: Expr) -> Expr: - """Returns the last ``n`` characters in the ``string``.""" + """Returns the last ``n`` characters in the ``string``. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select(dfn.functions.right(dfn.col("a"), dfn.lit(3)).alias("r")) + >>> result.collect_column("r")[0].as_py() + 'llo' + """ return Expr(f.right(string.expr, n.expr)) @@ -1019,13 +1232,31 @@ def rpad(string: Expr, count: Expr, characters: Expr | None = None) -> Expr: Extends the string to length length by appending the characters fill (a space by default). If the string is already longer than length then it is truncated. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hi"]}) + >>> result = df.select( + ... dfn.functions.rpad(dfn.col("a"), dfn.lit(5), dfn.lit("!")).alias("r")) + >>> result.collect_column("r")[0].as_py() + 'hi!!!' """ characters = characters if characters is not None else Expr.literal(" ") return Expr(f.rpad(string.expr, count.expr, characters.expr)) def rtrim(arg: Expr) -> Expr: - """Removes all characters, spaces by default, from the end of a string.""" + """Removes all characters, spaces by default, from the end of a string. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [" a "]}) + >>> trim_df = df.select(dfn.functions.rtrim(dfn.col("a")).alias("trimmed")) + >>> trim_df.collect_column("trimmed")[0].as_py() + ' a' + """ return Expr(f.rtrim(arg.expr)) @@ -1087,6 +1318,15 @@ def split_part(string: Expr, delimiter: Expr, index: Expr) -> Expr: Splits a string based on a delimiter and picks out the desired field based on the index. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["a,b,c"]}) + >>> result = df.select( + ... dfn.functions.split_part(dfn.col("a"), dfn.lit(","), dfn.lit(2)).alias("s")) + >>> result.collect_column("s")[0].as_py() + 'b' """ return Expr(f.split_part(string.expr, delimiter.expr, index.expr)) @@ -1097,17 +1337,46 @@ def sqrt(arg: Expr) -> Expr: def starts_with(string: Expr, prefix: Expr) -> Expr: - """Returns true if string starts with prefix.""" + """Returns true if string starts with prefix. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello_from_datafusion"]}) + >>> result = df.select( + ... dfn.functions.starts_with(dfn.col("a"), dfn.lit("hello")).alias("sw")) + >>> result.collect_column("sw")[0].as_py() + True + """ return Expr(f.starts_with(string.expr, prefix.expr)) def strpos(string: Expr, substring: Expr) -> Expr: - """Finds the position from where the ``substring`` matches the ``string``.""" + """Finds the position from where the ``substring`` matches the ``string``. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select( + ... dfn.functions.strpos(dfn.col("a"), dfn.lit("llo")).alias("pos")) + >>> result.collect_column("pos")[0].as_py() + 3 + """ return Expr(f.strpos(string.expr, substring.expr)) def substr(string: Expr, position: Expr) -> Expr: - """Substring from the ``position`` to the end.""" + """Substring from the ``position`` to the end. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select(dfn.functions.substr(dfn.col("a"), dfn.lit(3)).alias("s")) + >>> result.collect_column("s")[0].as_py() + 'llo' + """ return Expr(f.substr(string.expr, position.expr)) @@ -1116,12 +1385,32 @@ def substr_index(string: Expr, delimiter: Expr, count: Expr) -> Expr: The return will be the ``string`` from before ``count`` occurrences of ``delimiter``. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["a.b.c"]}) + >>> result = df.select( + ... dfn.functions.substr_index(dfn.col("a"), dfn.lit("."), + ... dfn.lit(2)).alias("s")) + >>> result.collect_column("s")[0].as_py() + 'a.b' """ return Expr(f.substr_index(string.expr, delimiter.expr, count.expr)) def substring(string: Expr, position: Expr, length: Expr) -> Expr: - """Substring from the ``position`` with ``length`` characters.""" + """Substring from the ``position`` with ``length`` characters. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello world"]}) + >>> result = df.select( + ... dfn.functions.substring(dfn.col("a"), dfn.lit(1), dfn.lit(5)).alias("s")) + >>> result.collect_column("s")[0].as_py() + 'hello' + """ return Expr(f.substring(string.expr, position.expr, length.expr)) @@ -1154,7 +1443,16 @@ def tanh(arg: Expr) -> Expr: def to_hex(arg: Expr) -> Expr: - """Converts an integer to a hexadecimal string.""" + """Converts an integer to a hexadecimal string. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [255]}) + >>> result = df.select(dfn.functions.to_hex(dfn.col("a")).alias("hex")) + >>> result.collect_column("hex")[0].as_py() + 'ff' + """ return Expr(f.to_hex(arg.expr)) @@ -1321,12 +1619,32 @@ def make_date(year: Expr, month: Expr, day: Expr) -> Expr: def translate(string: Expr, from_val: Expr, to_val: Expr) -> Expr: - """Replaces the characters in ``from_val`` with the counterpart in ``to_val``.""" + """Replaces the characters in ``from_val`` with the counterpart in ``to_val``. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select( + ... dfn.functions.translate(dfn.col("a"), dfn.lit("helo"), + ... dfn.lit("HELO")).alias("t")) + >>> result.collect_column("t")[0].as_py() + 'HELLO' + """ return Expr(f.translate(string.expr, from_val.expr, to_val.expr)) def trim(arg: Expr) -> Expr: - """Removes all characters, spaces by default, from both sides of a string.""" + """Removes all characters, spaces by default, from both sides of a string. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": [" hello "]}) + >>> result = df.select(dfn.functions.trim(dfn.col("a")).alias("t")) + >>> result.collect_column("t")[0].as_py() + 'hello' + """ return Expr(f.trim(arg.expr)) @@ -1338,7 +1656,16 @@ def trunc(num: Expr, precision: Expr | None = None) -> Expr: def upper(arg: Expr) -> Expr: - """Converts a string to uppercase.""" + """Converts a string to uppercase. + + Examples: + --------- + >>> ctx = dfn.SessionContext() + >>> df = ctx.from_pydict({"a": ["hello"]}) + >>> result = df.select(dfn.functions.upper(dfn.col("a")).alias("u")) + >>> result.collect_column("u")[0].as_py() + 'HELLO' + """ return Expr(f.upper(arg.expr))