From 30c72a1640d368d192c79a0ff16b12f0a95c4af9 Mon Sep 17 00:00:00 2001 From: Dom Delnano Date: Fri, 11 Oct 2024 14:56:54 -0700 Subject: [PATCH] Add split UDF func to ease certain string handling (#2039) Summary: Add split UDF func to ease certain string handling. This is functionality that I'm planning to use for my upcoming Kubecon demo/talk. For this use case, I want to access the individual fields of the XFCC header just like the example in the UDF docstring. Relevant Issues: N/A Type of change: /kind feature Test Plan: New tests pass Changelog Message: Added `px.split` function to support parsing strings that contain delimiters --------- Signed-off-by: Dom Del Nano --- src/carnot/funcs/builtins/json_ops.cc | 1 + src/carnot/funcs/builtins/json_ops.h | 36 ++++++++++++++++++++++ src/carnot/funcs/builtins/json_ops_test.cc | 10 ++++++ 3 files changed, 47 insertions(+) diff --git a/src/carnot/funcs/builtins/json_ops.cc b/src/carnot/funcs/builtins/json_ops.cc index acb8e8c2ffd..32083255c0b 100644 --- a/src/carnot/funcs/builtins/json_ops.cc +++ b/src/carnot/funcs/builtins/json_ops.cc @@ -31,6 +31,7 @@ void RegisterJSONOpsOrDie(udf::Registry* registry) { registry->RegisterOrDie("pluck_int64"); registry->RegisterOrDie("pluck_float64"); registry->RegisterOrDie<PluckArrayUDF>("pluck_array"); + registry->RegisterOrDie<SplitUDF>("split"); // Up to 8 script args are supported for the _script_reference UDF, due to the lack of support for // variadic UDF arguments in the UDF registry today. 
We should clean this up if/when variadic UDF diff --git a/src/carnot/funcs/builtins/json_ops.h b/src/carnot/funcs/builtins/json_ops.h index 6e04fac1728..c04fc2ad75a 100644 --- a/src/carnot/funcs/builtins/json_ops.h +++ b/src/carnot/funcs/builtins/json_ops.h @@ -228,6 +228,42 @@ class PluckArrayUDF : public udf::ScalarUDF { } }; +class SplitUDF : public udf::ScalarUDF { + public: + StringValue Exec(FunctionContext*, StringValue in, StringValue delimiter) { + rapidjson::StringBuffer sb; + rapidjson::Writer<rapidjson::StringBuffer> writer(sb); + writer.StartArray(); + + for (absl::string_view part : absl::StrSplit(in.data(), delimiter.data())) { + writer.String(part.data(), part.size()); + } + + writer.EndArray(); + return sb.GetString(); + } + + static udf::ScalarUDFDocBuilder Doc() { + return udf::ScalarUDFDocBuilder( + "Splits a string by a delimiter and returns a JSON encoded array of strings.") + .Details( + "This function splits a string by a delimiter and returns a JSON encoded array of " + "strings. The function is useful for splitting strings and then passing the result to " + "px.pluck_array in order to access individual values of a delimited string.") + .Example(R"doc( + | df = px.DataFrame('http_events', start_time='-5m') + | # Returns By=http://frontend.px.dev;URI=http://testclient.px.dev + | df.xfcc_hdr = px.pluck(df.req_headers, 'X-Forwarded-Client-Cert') + | df.xfcc_parts = px.split(df.xfcc_hdr, ';') + | df.by = px.pluck_array(df.xfcc_parts, 0) # Returns "By=http://frontend.px.dev" + | df.uri = px.pluck_array(df.xfcc_parts, 1) # Returns "URI=http://testclient.px.dev" + )doc") + .Arg("input_str", "The string to split.") + .Arg("delimiter", "The string value to split the input string.") + .Returns("A JSON encoded array of the split strings."); + } +}; + /** DocString intentionally omitted, this is a non-public function. 
This function creates a custom deep link by creating a "script reference" from a label, diff --git a/src/carnot/funcs/builtins/json_ops_test.cc b/src/carnot/funcs/builtins/json_ops_test.cc index 56aa7bd0299..ae175b4a6ab 100644 --- a/src/carnot/funcs/builtins/json_ops_test.cc +++ b/src/carnot/funcs/builtins/json_ops_test.cc @@ -111,6 +111,16 @@ TEST(JSONOps, PluckArrayUDF_index_out_of_bound) { udf_tester.ForInput(kTestJSONArray, 3).Expect(""); } +TEST(JSONOps, SplitUDF_with_present_delimiter) { + auto udf_tester = udf::UDFTester<SplitUDF>(); + udf_tester.ForInput("foo,bar,baz", ",").Expect(R"(["foo","bar","baz"])"); +} + +TEST(JSONOps, SplitUDF_with_missing_delimiter) { + auto udf_tester = udf::UDFTester<SplitUDF>(); + udf_tester.ForInput("foo,bar,baz", ";").Expect(R"(["foo,bar,baz"])"); +} + TEST(JSONOps, ScriptReferenceUDF_no_args) { auto udf_tester = udf::UDFTester<ScriptReferenceUDF<>>(); auto res = udf_tester.ForInput("text", "px/script").Result();