Skip to content

Commit

Permalink
Add split UDF func to ease certain string handling
Browse files Browse the repository at this point in the history
Signed-off-by: Dom Del Nano <[email protected]>
  • Loading branch information
ddelnano committed Oct 8, 2024
1 parent 3c41d55 commit 1860772
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/carnot/funcs/builtins/json_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ void RegisterJSONOpsOrDie(udf::Registry* registry) {
registry->RegisterOrDie<PluckAsInt64UDF>("pluck_int64");
registry->RegisterOrDie<PluckAsFloat64UDF>("pluck_float64");
registry->RegisterOrDie<PluckArrayUDF>("pluck_array");
registry->RegisterOrDie<SplitUDF>("split");

// Up to 8 script args are supported for the _script_reference UDF, due to the lack of support for
// variadic UDF arguments in the UDF registry today. We should clean this up if/when variadic UDF
Expand Down
43 changes: 43 additions & 0 deletions src/carnot/funcs/builtins/json_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,49 @@ class PluckArrayUDF : public udf::ScalarUDF {
}
};

/**
 * SplitUDF splits a string on a delimiter and returns the pieces as a JSON encoded array of
 * strings.
 *
 * Behavior worth knowing before relying on it:
 *  - If the delimiter never occurs in the input (or the delimiter is empty), the result is an
 *    empty array `[]`, not a single-element array holding the whole input.
 *  - A trailing delimiter does not produce a trailing empty string ("a,b," -> ["a","b"]), while a
 *    leading delimiter does produce a leading empty string (",a" -> ["","a"]).
 */
class SplitUDF : public udf::ScalarUDF {
 public:
  /**
   * @param in The string to split.
   * @param delimiter The separator to split on. An empty delimiter is treated as never matching.
   * @return A JSON encoded array containing the pieces of `in` found between delimiters.
   */
  StringValue Exec(FunctionContext*, StringValue in, StringValue delimiter) {
    rapidjson::StringBuffer sb;
    rapidjson::Writer<rapidjson::StringBuffer> writer(sb);
    writer.StartArray();

    const std::string_view s(in.data(), in.size());
    // Search with an explicit length so the needle used by find() always agrees with the amount
    // we skip afterwards, even if the delimiter contains an embedded NUL byte.
    const std::string_view delim(delimiter.data(), delimiter.size());

    // An empty delimiter matches at every position without consuming any input, so the scan
    // below would never advance. Handle it like a delimiter that is not present.
    if (!delim.empty()) {
      size_t idx = 0;
      while (idx < s.size()) {
        const size_t next_idx = s.find(delim, idx);
        if (next_idx == std::string_view::npos) {
          // Only emit the remainder when at least one delimiter was seen earlier; an input
          // without any delimiter yields an empty array.
          if (idx > 0) {
            const std::string_view tail = s.substr(idx);
            writer.String(tail.data(), static_cast<rapidjson::SizeType>(tail.size()));
          }
          break;
        }
        const std::string_view piece = s.substr(idx, next_idx - idx);
        writer.String(piece.data(), static_cast<rapidjson::SizeType>(piece.size()));
        idx = next_idx + delim.size();
      }
    }

    writer.EndArray();
    return sb.GetString();
  }

  static udf::ScalarUDFDocBuilder Doc() {
    return udf::ScalarUDFDocBuilder(
               "Splits a string by a delimiter and returns a JSON encoded array of strings.")
        .Details(
            "This function splits a string by a delimiter and returns a JSON encoded array of "
            "strings. The function is useful for splitting strings and then passing the result to "
            "px.pluck_array in order to access individual values of a delimited string.")
        .Example(R"doc(
        | df = px.DataFrame('http_events', start_time='-5m')
        | # Returns By=http://frontend.px.dev;URI=http://testclient.px.dev
        | df.xfcc_hdr = px.pluck(df.req_headers, 'X-Forwarded-Client-Cert')
        | df.xfcc_parts = px.split(df.xfcc_hdr, ';')
        | df.by = px.pluck_array(df.xfcc_parts, 0) # Returns "By=http://frontend.px.dev"
        | df.uri = px.pluck_array(df.xfcc_parts, 1) # Returns "URI=http://testclient.px.dev"
        )doc")
        .Arg("input_str", "The string to split.")
        .Arg("delimiter", "The string value to split the input string.")
        .Returns("A JSON encoded array of the split strings.");
  }
};

/**
DocString intentionally omitted, this is a non-public function.
This function creates a custom deep link by creating a "script reference" from a label,
Expand Down
10 changes: 10 additions & 0 deletions src/carnot/funcs/builtins/json_ops_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,16 @@ TEST(JSONOps, PluckArrayUDF_index_out_of_bound) {
udf_tester.ForInput(kTestJSONArray, 3).Expect("");
}

// Splitting on a delimiter that occurs in the input yields one array element per piece.
TEST(JSONOps, SplitUDF_with_present_delimiter) {
  udf::UDFTester<SplitUDF> tester;
  tester.ForInput("foo,bar,baz", ",").Expect(R"(["foo","bar","baz"])");
}

// When the delimiter never occurs in the input, the UDF returns an empty JSON array.
TEST(JSONOps, SplitUDF_with_missing_delimiter) {
  udf::UDFTester<SplitUDF> tester;
  tester.ForInput("foo,bar,baz", ";").Expect(R"([])");
}

TEST(JSONOps, ScriptReferenceUDF_no_args) {
auto udf_tester = udf::UDFTester<ScriptReferenceUDF<>>();
auto res = udf_tester.ForInput("text", "px/script").Result();
Expand Down

0 comments on commit 1860772

Please sign in to comment.