diff --git a/velox/docs/functions/spark/json.rst b/velox/docs/functions/spark/json.rst index 48409c3af0c33..838872865f71f 100644 --- a/velox/docs/functions/spark/json.rst +++ b/velox/docs/functions/spark/json.rst @@ -25,8 +25,8 @@ JSON Functions match the field names of the ROW exactly (case-sensitive). Behaviors of the casts are shown with the examples below. :: - SELECT from_json('{"a": true}'); -- {'a'=true} // Output type: ROW(a BOOLEAN) - SELECT from_json('{"a": 1}'); -- {'a'=1} // Output type: ROW(a INTEGER) - SELECT from_json('{"a": 1.0}'); -- {'a'=1.0} // Output type: ROW(a DOUBLE) - SELECT from_json('["name", "age", "id"]'); -- ['name', 'age', 'id'] // Output type: ARRAY(VARCHAR) - SELECT from_json('{"a": 1, "b": 2}'); -- {'a'=1, 'b'=2} // Output type: MAP(VARCHAR,INTEGER) + SELECT from_json('{"a": true}'); -- {'a'=true} // Output type: ROW({"a"}, {BOOLEAN()}) + SELECT from_json('{"a": 1}'); -- {'a'=1} // Output type: ROW({"a"}, {INTEGER()}) + SELECT from_json('{"a": 1.0}'); -- {'a'=1.0} // Output type: ROW({"a"}, {DOUBLE()}) + SELECT from_json('["name", "age", "id"]'); -- ['name', 'age', 'id'] // Output type: ARRAY(VARCHAR()) + SELECT from_json('{"a": 1, "b": 2}'); -- {'a'=1, 'b'=2} // Output type: MAP(VARCHAR(),INTEGER()) diff --git a/velox/functions/sparksql/specialforms/FromJson.cpp b/velox/functions/sparksql/specialforms/FromJson.cpp index 0d59ee5858166..707e4c6662ec0 100644 --- a/velox/functions/sparksql/specialforms/FromJson.cpp +++ b/velox/functions/sparksql/specialforms/FromJson.cpp @@ -19,10 +19,7 @@ #include #include -#include "velox/expression/CastExpr.h" #include "velox/expression/EvalCtx.h" -#include "velox/expression/PeeledEncoding.h" -#include "velox/expression/ScopedVarSetter.h" #include "velox/expression/SpecialForm.h" #include "velox/expression/VectorWriters.h" #include "velox/functions/prestosql/json/SIMDJsonUtil.h" @@ -33,8 +30,9 @@ using namespace facebook::velox::exec; namespace facebook::velox::functions::sparksql { namespace { +/// Struct for extracting JSON data and writing it with type-specific handling. template -struct ParseJsonTypedImpl { +struct ExtractJsonTypedImpl { template static simdjson::error_code apply(Input input, exec::GenericWriter& writer, bool isRoot) { @@ -55,7 +53,7 @@ struct ParseJsonTypedImpl { template struct KindDispatcher { static simdjson::error_code - apply(Input value, exec::GenericWriter& writer, bool isRoot) { + apply(Input value, exec::GenericWriter& writer, bool /*isRoot*/) { SIMDJSON_ASSIGN_OR_RAISE(auto type, value.type()); std::string_view s; switch (type) { @@ -78,7 +76,7 @@ struct ParseJsonTypedImpl { template struct KindDispatcher { static simdjson::error_code - apply(Input value, exec::GenericWriter& writer, bool isRoot) { + apply(Input value, exec::GenericWriter& writer, bool /*isRoot*/) { SIMDJSON_ASSIGN_OR_RAISE(auto type, value.type()); auto& w = writer.castTo(); switch (type) { @@ -96,7 +94,7 @@ struct ParseJsonTypedImpl { template struct KindDispatcher { static simdjson::error_code - apply(Input value, exec::GenericWriter& writer, bool isRoot) { + apply(Input value, exec::GenericWriter& writer, bool /*isRoot*/) { return castJsonToInt(value, writer); } }; @@ -104,7 +102,7 @@ struct ParseJsonTypedImpl { template struct KindDispatcher { static simdjson::error_code - apply(Input value, exec::GenericWriter& writer, bool isRoot) { + apply(Input value, exec::GenericWriter& writer, bool /*isRoot*/) { return castJsonToInt(value, writer); } }; @@ -112,7 +110,7 @@ struct ParseJsonTypedImpl { template struct KindDispatcher { static simdjson::error_code - apply(Input value, exec::GenericWriter& writer, bool isRoot) { + apply(Input value, exec::GenericWriter& writer, bool /*isRoot*/) { return castJsonToInt(value, writer); } }; @@ -120,7 +118,7 @@ struct ParseJsonTypedImpl { template struct KindDispatcher { static simdjson::error_code - apply(Input value, exec::GenericWriter& writer, bool isRoot) { + apply(Input value, exec::GenericWriter& writer, bool /*isRoot*/) { return castJsonToInt(value, writer); } }; @@ -128,7 +126,7 @@ struct ParseJsonTypedImpl { template struct KindDispatcher { static simdjson::error_code - apply(Input value, exec::GenericWriter& writer, bool isRoot) { + apply(Input value, exec::GenericWriter& writer, bool /*isRoot*/) { return castJsonToFloatingPoint(value, writer); } }; @@ -136,7 +134,7 @@ struct ParseJsonTypedImpl { template struct KindDispatcher { static simdjson::error_code - apply(Input value, exec::GenericWriter& writer, bool isRoot) { + apply(Input value, exec::GenericWriter& writer, bool /*isRoot*/) { return castJsonToFloatingPoint(value, writer); } }; @@ -146,7 +144,7 @@ struct ParseJsonTypedImpl { static simdjson::error_code apply(Input value, exec::GenericWriter& writer, bool isRoot) { auto& writerTyped = writer.castTo>(); - auto& elementType = writer.type()->childAt(0); + const auto& elementType = writer.type()->childAt(0); SIMDJSON_ASSIGN_OR_RAISE(auto type, value.type()); if (type == simdjson::ondemand::json_type::array) { SIMDJSON_ASSIGN_OR_RAISE(auto array, value.get_array()); @@ -158,7 +156,7 @@ struct ParseJsonTypedImpl { writerTyped.add_null(); } else { SIMDJSON_TRY(VELOX_DYNAMIC_TYPE_DISPATCH( - ParseJsonTypedImpl::apply, + ExtractJsonTypedImpl::apply, elementType->kind(), element, writerTyped.add_item(), @@ -167,7 +165,7 @@ struct ParseJsonTypedImpl { } } else if (elementType->kind() == TypeKind::ROW && isRoot) { SIMDJSON_TRY(VELOX_DYNAMIC_TYPE_DISPATCH( - ParseJsonTypedImpl::apply, + ExtractJsonTypedImpl::apply, elementType->kind(), value, writerTyped.add_item(), @@ -182,10 +180,9 @@ struct ParseJsonTypedImpl { template struct KindDispatcher { static simdjson::error_code - apply(Input value, exec::GenericWriter& writer, bool isRoot) { + apply(Input value, exec::GenericWriter& writer, bool /*isRoot*/) { auto& writerTyped = writer.castTo>(); - auto& keyType = writer.type()->childAt(0); - auto& valueType = writer.type()->childAt(1); + const auto& valueType = writer.type()->childAt(1); SIMDJSON_ASSIGN_OR_RAISE(auto object, value.get_object()); for (auto fieldResult : object) { SIMDJSON_ASSIGN_OR_RAISE(auto field, fieldResult); @@ -198,7 +195,7 @@ struct ParseJsonTypedImpl { auto writers = writerTyped.add_item(); std::get<0>(writers).castTo().append(key); SIMDJSON_TRY(VELOX_DYNAMIC_TYPE_DISPATCH( - ParseJsonTypedImpl::apply, + ExtractJsonTypedImpl::apply, valueType->kind(), field.value(), std::get<1>(writers), @@ -213,7 +210,7 @@ struct ParseJsonTypedImpl { struct KindDispatcher { static simdjson::error_code apply(Input value, exec::GenericWriter& writer, bool isRoot) { - auto& rowType = writer.type()->asRow(); + const auto& rowType = writer.type()->asRow(); auto& writerTyped = writer.castTo(); if (value.type().error() != ::simdjson::SUCCESS) { writerTyped.set_null_at(0); @@ -245,7 +242,7 @@ struct ParseJsonTypedImpl { it->second = -1; auto res = VELOX_DYNAMIC_TYPE_DISPATCH( - ParseJsonTypedImpl::apply, + ExtractJsonTypedImpl::apply, rowType.childAt(index)->kind(), field.value(), writerTyped.get_writer_at(index), @@ -257,12 +254,14 @@ struct ParseJsonTypedImpl { } } - for (const auto& [key, index] : fieldIndices) { + for (const auto& [_, index] : fieldIndices) { if (index >= 0) { writerTyped.set_null_at(index); } } } else { + // Handle other JSON types: set null to the writer if it's the root doc, + // otherwise return INCORRECT_TYPE to the caller. if (isRoot) { writerTyped.set_null_at(0); return simdjson::SUCCESS; @@ -274,23 +273,6 @@ struct ParseJsonTypedImpl { } }; - static simdjson::simdjson_result rawJson( - Input value, - simdjson::ondemand::json_type type) { - switch (type) { - case simdjson::ondemand::json_type::array: { - SIMDJSON_ASSIGN_OR_RAISE(auto array, value.get_array()); - return array.raw_json(); - } - case simdjson::ondemand::json_type::object: { - SIMDJSON_ASSIGN_OR_RAISE(auto object, value.get_object()); - return object.raw_json(); - } - default: - return value.raw_json_token(); - } - } - template static simdjson::error_code castJsonToInt( Input value, @@ -307,7 +289,6 @@ struct ParseJsonTypedImpl { default: return simdjson::INCORRECT_TYPE; } - break; } default: return simdjson::INCORRECT_TYPE; @@ -315,6 +296,8 @@ struct ParseJsonTypedImpl { return simdjson::SUCCESS; } + // Casts a JSON value to a float point, handling both numeric Special cases + // for NaN and Infinity. template static simdjson::error_code castJsonToFloatingPoint( Input value, @@ -368,8 +351,9 @@ struct ParseJsonTypedImpl { /// - Failure Handling: Returns `NULL` for invalid JSON or incompatible values. /// - Boolean: Only `true` and `false` are valid; others return `NULL`. /// - Integral Types: Accepts only integers; floats or strings return `NULL`. -/// - Float/Double: All numbers are valid; strings like `"NaN"` , `"INF"` -/// `"Infinity"` are accepted, others return `NULL`. +/// - Float/Double: All numbers are valid; strings like `"NaN"`, `"+INF"`, +/// `"+Infinity"`, `"Infinity"`, `"-INF"`, +/// `"-Infinity"` are accepted, others return `NULL`. /// - Array: Accepts JSON objects only if the array is the root type with ROW /// child type. /// - Map: Keys must be `VARCHAR` type. @@ -425,7 +409,7 @@ class FromJsonFunction final : public exec::VectorFunction { context.applyToSelectedNoThrow(rows, [&](auto row) { writer.setOffset(row); if (error != simdjson::SUCCESS || - paseJsonOneRow(jsonDoc, writer) != simdjson::SUCCESS) { + extractJsonToWriter(jsonDoc, writer) != simdjson::SUCCESS) { writer.commitNull(); } }); @@ -448,7 +432,7 @@ class FromJsonFunction final : public exec::VectorFunction { if (inputVector->isNullAt(row)) { return; } - auto& input = inputVector->valueAt(row); + const auto& input = inputVector->valueAt(row); maxSize = std::max(maxSize, input.size()); }); paddedInput_.resize(maxSize + simdjson::SIMDJSON_PADDING); @@ -458,52 +442,61 @@ class FromJsonFunction final : public exec::VectorFunction { writer.commitNull(); return; } - auto& input = inputVector->valueAt(row); + const auto& input = inputVector->valueAt(row); memcpy(paddedInput_.data(), input.data(), input.size()); simdjson::padded_string_view paddedInput( paddedInput_.data(), input.size(), paddedInput_.size()); simdjson::ondemand::document doc; auto error = simdjsonParse(paddedInput).get(doc); if (error != simdjson::SUCCESS || - paseJsonOneRow(doc, writer) != simdjson::SUCCESS) { + extractJsonToWriter(doc, writer) != simdjson::SUCCESS) { writer.commitNull(); } }); writer.finish(); } - static simdjson::error_code paseJsonOneRow( + // Extracts data from json doc and writes it to writer. + static simdjson::error_code extractJsonToWriter( simdjson::ondemand::document& doc, exec::VectorWriter& writer) { if (doc.is_null()) { writer.commitNull(); } else { SIMDJSON_TRY( - ParseJsonTypedImpl::apply( + ExtractJsonTypedImpl::apply( doc, writer.current(), true)); writer.commit(true); } return simdjson::SUCCESS; } + // The buffer with extra bytes for parser::parse(), mutable std::string paddedInput_; }; +/// Determines whether a given type is supported. +/// @param isRootType. A flag indicating whether the type is the root type in +/// the evaluation context. Only ROW, ARRAY, and MAP are allowed as root types; +/// this flag helps differentiate such cases. bool isSupportedType(const TypePtr& type, bool isRootType) { switch (type->kind()) { - case TypeKind::ARRAY: + case TypeKind::ARRAY: { return isSupportedType(type->childAt(0), false); - case TypeKind::ROW: - for (const auto& child : type->as().children()) { + } + case TypeKind::ROW: { + for (const auto& child : asRowType(type)->children()) { if (!isSupportedType(child, false)) { return false; } } return true; - case TypeKind::MAP: + } + case TypeKind::MAP: { return ( type->childAt(0)->kind() == TypeKind::VARCHAR && isSupportedType(type->childAt(1), false)); + } case TypeKind::BIGINT: { if (type->isDecimal()) { return false; @@ -547,9 +540,8 @@ exec::ExprPtr FromJsonCallToSpecialForm::constructSpecialForm( TypeKind::VARCHAR, "The first argument of from_json should be of varchar type."); - if (!isSupportedType(type, true)) { - VELOX_UNSUPPORTED("Unsupported type {}.", type->toString()); - } + VELOX_USER_CHECK( + isSupportedType(type, true), "Unsupported type {}.", type->toString()); std::shared_ptr func; if (type->kind() == TypeKind::ARRAY) {