Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add split UDF func to ease certain string handling #2039

Merged
merged 5 commits into from
Oct 11, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/carnot/funcs/builtins/json_ops.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ void RegisterJSONOpsOrDie(udf::Registry* registry) {
registry->RegisterOrDie<PluckAsInt64UDF>("pluck_int64");
registry->RegisterOrDie<PluckAsFloat64UDF>("pluck_float64");
registry->RegisterOrDie<PluckArrayUDF>("pluck_array");
registry->RegisterOrDie<SplitUDF>("split");

// Up to 8 script args are supported for the _script_reference UDF, due to the lack of support for
// variadic UDF arguments in the UDF registry today. We should clean this up if/when variadic UDF
Expand Down
36 changes: 36 additions & 0 deletions src/carnot/funcs/builtins/json_ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,42 @@ class PluckArrayUDF : public udf::ScalarUDF {
}
};

/**
 * Scalar UDF that splits a string on a delimiter and returns the pieces as a JSON encoded
 * array of strings, e.g. splitting "foo,bar,baz" on "," yields ["foo","bar","baz"].
 * When the delimiter does not occur in the input, the result is a single-element array
 * holding the whole input.
 */
class SplitUDF : public udf::ScalarUDF {
 public:
  /**
   * Splits `in` on every occurrence of `delimiter`.
   *
   * @param in The string to split.
   * @param delimiter The separator string to split on.
   * @return The split pieces serialized as a JSON array of strings.
   */
  StringValue Exec(FunctionContext*, StringValue in, StringValue delimiter) {
    rapidjson::StringBuffer sb;
    rapidjson::Writer<rapidjson::StringBuffer> writer(sb);
    writer.StartArray();

    // NOTE(review): data() hands StrSplit a C string, so input/delimiter are presumably
    // truncated at the first embedded NUL byte -- confirm this is acceptable for callers.
    for (absl::string_view part : absl::StrSplit(in.data(), delimiter.data())) {
      // Each piece is a view into the input and is not NUL-terminated, so pass its length.
      writer.String(part.data(), part.size());
    }

    writer.EndArray();
    return sb.GetString();
  }

  static udf::ScalarUDFDocBuilder Doc() {
    return udf::ScalarUDFDocBuilder(
               "Splits a string by a delimiter and returns a JSON encoded array of strings.")
        .Details(
            "This function splits a string by a delimiter and returns a JSON encoded array of "
            "strings. The function is useful for splitting strings and then passing the result to "
            "px.pluck_array in order to access individual values of a delimited string.")
        .Example(R"doc(
        | df = px.DataFrame('http_events', start_time='-5m')
        | # Returns By=http://frontend.px.dev;URI=http://testclient.px.dev
        | df.xfcc_hdr = px.pluck(df.req_headers, 'X-Forwarded-Client-Cert')
        | df.xfcc_parts = px.split(df.xfcc_hdr, ';')
        | df.by = px.pluck_array(df.xfcc_parts, 0) # Returns "By=http://frontend.px.dev"
        | df.uri = px.pluck_array(df.xfcc_parts, 1) # Returns "URI=http://testclient.px.dev"
        )doc")
        .Arg("input_str", "The string to split.")
        .Arg("delimiter", "The string value to split the input string.")
        .Returns("A JSON encoded array of the split strings.");
  }
};

/**
DocString intentionally omitted, this is a non-public function.
This function creates a custom deep link by creating a "script reference" from a label,
Expand Down
10 changes: 10 additions & 0 deletions src/carnot/funcs/builtins/json_ops_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,16 @@ TEST(JSONOps, PluckArrayUDF_index_out_of_bound) {
udf_tester.ForInput(kTestJSONArray, 3).Expect("");
}

// A delimiter that occurs in the input is expected to yield one JSON array element per field.
TEST(JSONOps, SplitUDF_with_present_delimiter) {
  constexpr char kExpectedJSON[] = R"(["foo","bar","baz"])";
  auto split_tester = udf::UDFTester<SplitUDF>();
  split_tester.ForInput("foo,bar,baz", ",").Expect(kExpectedJSON);
}

// A delimiter that never occurs in the input is expected to yield a single-element JSON array
// containing the unmodified input.
TEST(JSONOps, SplitUDF_with_missing_delimiter) {
  constexpr char kExpectedJSON[] = R"(["foo,bar,baz"])";
  auto split_tester = udf::UDFTester<SplitUDF>();
  split_tester.ForInput("foo,bar,baz", ";").Expect(kExpectedJSON);
}

TEST(JSONOps, ScriptReferenceUDF_no_args) {
auto udf_tester = udf::UDFTester<ScriptReferenceUDF<>>();
auto res = udf_tester.ForInput("text", "px/script").Result();
Expand Down
Loading