diff --git a/src/carnot/funcs/builtins/json_ops.cc b/src/carnot/funcs/builtins/json_ops.cc index acb8e8c2ffd..32083255c0b 100644 --- a/src/carnot/funcs/builtins/json_ops.cc +++ b/src/carnot/funcs/builtins/json_ops.cc @@ -31,6 +31,7 @@ void RegisterJSONOpsOrDie(udf::Registry* registry) { registry->RegisterOrDie("pluck_int64"); registry->RegisterOrDie("pluck_float64"); registry->RegisterOrDie("pluck_array"); + registry->RegisterOrDie("split"); // Up to 8 script args are supported for the _script_reference UDF, due to the lack of support for // variadic UDF arguments in the UDF registry today. We should clean this up if/when variadic UDF diff --git a/src/carnot/funcs/builtins/json_ops.h b/src/carnot/funcs/builtins/json_ops.h index 6e04fac1728..0a6c87ad376 100644 --- a/src/carnot/funcs/builtins/json_ops.h +++ b/src/carnot/funcs/builtins/json_ops.h @@ -228,6 +228,49 @@ class PluckArrayUDF : public udf::ScalarUDF { } }; +class SplitUDF : public udf::ScalarUDF { + public: + StringValue Exec(FunctionContext*, StringValue in, StringValue delimiter) { + rapidjson::StringBuffer sb; + rapidjson::Writer writer(sb); + writer.StartArray(); + + std::string_view s(in.data(), in.size()); + size_t idx = 0; + while (idx < s.size()) { + auto next_idx = s.find(delimiter.data(), idx); + if (next_idx == std::string::npos) { + if (idx > 0) { + writer.String(s.substr(idx).data(), s.substr(idx).size()); + } + break; + } + writer.String(s.substr(idx, next_idx - idx).data(), next_idx - idx); + idx = next_idx + delimiter.size(); + } + writer.EndArray(); + return sb.GetString(); + } + static udf::ScalarUDFDocBuilder Doc() { + return udf::ScalarUDFDocBuilder( + "Splits a string by a delimiter and a returns JSON encoded array of strings.") + .Details( + "This function splits a string by a delimiter and returns a JSON encoded array of " + "strings. The function is useful for splitting strings and then passing the result to " + "px.pluck_array in order to access individual values of a delimited string.") + .Example(R"doc( + | df = px.DataFrame('http_events', start_time='-5m') + | # Returns By=http://frontend.px.dev;URI=http://testclient.px.dev + | df.xfcc_hdr = px.pluck(df.req_headers, 'X-Forwarded-Client-Cert') + | df.by= px.pluck_array(df.xfcc_hdr, 0) # Returns "By=http://frontend.px.dev" + | df.uri = px.pluck_array(df.xfcc_hdr, 1) # Returns "URI=http://testclient.px.dev" + )doc") + .Arg("input_str", "The string to split.") + .Arg("delimiter", "The string value to split the input string.") + .Returns("A JSON encoded array of the split strings."); + } +}; + /** DocString intentionally omitted, this is a non-public function. This function creates a custom deep link by creating a "script reference" from a label, diff --git a/src/carnot/funcs/builtins/json_ops_test.cc b/src/carnot/funcs/builtins/json_ops_test.cc index 56aa7bd0299..b735e36379e 100644 --- a/src/carnot/funcs/builtins/json_ops_test.cc +++ b/src/carnot/funcs/builtins/json_ops_test.cc @@ -111,6 +111,16 @@ TEST(JSONOps, PluckArrayUDF_index_out_of_bound) { udf_tester.ForInput(kTestJSONArray, 3).Expect(""); } +TEST(JSONOps, SplitUDF_with_present_delimiter) { + auto udf_tester = udf::UDFTester(); + udf_tester.ForInput("foo,bar,baz", ",").Expect(R"(["foo","bar","baz"])"); +} + +TEST(JSONOps, SplitUDF_with_missing_delimiter) { + auto udf_tester = udf::UDFTester(); + udf_tester.ForInput("foo,bar,baz", ";").Expect(R"([])"); +} + TEST(JSONOps, ScriptReferenceUDF_no_args) { auto udf_tester = udf::UDFTester>(); auto res = udf_tester.ForInput("text", "px/script").Result();