diff --git a/velox/docs/functions/spark/json.rst b/velox/docs/functions/spark/json.rst index 5f853f0698e1..3d9b000c1336 100644 --- a/velox/docs/functions/spark/json.rst +++ b/velox/docs/functions/spark/json.rst @@ -44,3 +44,31 @@ JSON Functions SELECT json_object_keys(''); -- NULL SELECT json_object_keys(1); -- NULL SELECT json_object_keys('"hello"'); -- NULL + +.. spark:function:: from_json(jsonString) -> array / map / row + + Casts ``jsonString`` to an ARRAY, MAP, or ROW type, with the output type + determined by the expression. Returns NULL, if the input string is unparsable. + Supported element types include BOOLEAN, TINYINT, SMALLINT, INTEGER, BIGINT, + REAL, DOUBLE, VARCHAR, ARRAY, MAP and ROW. When casting to ARRAY or MAP, + the element type of the array or the value type of the map must be one of + these supported types, and for maps, the key type must be VARCHAR. Casting + to ROW supports only JSON objects. + The current implementation has the following limitations. + + * Does not support user provided options. + + * Only supports partial result mode, which requires spark configuration spark.sql.json.enablePartialResults = true. + + * Does not support single quotes as delimiters. + + * Does not support schemas that include a corrupt record column. + + Behaviors of the casts are shown with the examples below. :: + + SELECT from_json('{"a": true}'); -- {'a'=true} // Output type: ROW({"a"}, {BOOLEAN()}) + SELECT from_json('{"a": 1}'); -- {'a'=1} // Output type: ROW({"a"}, {INTEGER()}) + SELECT from_json('{"a": 1.0}'); -- {'a'=1.0} // Output type: ROW({"a"}, {DOUBLE()}) + SELECT from_json('["name", "age", "id"]'); -- ['name', 'age', 'id'] // Output type: ARRAY(VARCHAR()) + SELECT from_json('{"a": 1, "b": 2}'); -- {'a'=1, 'b'=2} // Output type: MAP(VARCHAR(),INTEGER()) + SELECT from_json('{"a": {"b": 1}}'); -- {'a'={b=1}} // Output type: ROW({"a"}, {ROW({"b"}, {INTEGER()})}) diff --git a/velox/functions/sparksql/registration/RegisterSpecialForm.cpp b/velox/functions/sparksql/registration/RegisterSpecialForm.cpp index d9f12abe4f80..6d4cbd8b81e4 100644 --- a/velox/functions/sparksql/registration/RegisterSpecialForm.cpp +++ b/velox/functions/sparksql/registration/RegisterSpecialForm.cpp @@ -18,6 +18,7 @@ #include "velox/expression/SpecialFormRegistry.h" #include "velox/functions/sparksql/specialforms/AtLeastNNonNulls.h" #include "velox/functions/sparksql/specialforms/DecimalRound.h" +#include "velox/functions/sparksql/specialforms/FromJson.h" #include "velox/functions/sparksql/specialforms/MakeDecimal.h" #include "velox/functions/sparksql/specialforms/SparkCastExpr.h" @@ -44,6 +45,9 @@ void registerSpecialFormGeneralFunctions(const std::string& prefix) { "cast", std::make_unique()); registerFunctionCallToSpecialForm( "try_cast", std::make_unique()); + exec::registerFunctionCallToSpecialForm( + FromJsonCallToSpecialForm::kFromJson, + std::make_unique()); } } // namespace sparksql } // namespace facebook::velox::functions diff --git a/velox/functions/sparksql/specialforms/CMakeLists.txt b/velox/functions/sparksql/specialforms/CMakeLists.txt index e141e0074bc8..cfce8dc29df2 100644 --- a/velox/functions/sparksql/specialforms/CMakeLists.txt +++ b/velox/functions/sparksql/specialforms/CMakeLists.txt @@ -16,9 +16,10 @@ velox_add_library( velox_functions_spark_specialforms AtLeastNNonNulls.cpp DecimalRound.cpp + FromJson.cpp MakeDecimal.cpp SparkCastExpr.cpp SparkCastHooks.cpp) velox_link_libraries(velox_functions_spark_specialforms fmt::fmt - velox_expression) + velox_functions_json velox_expression) diff --git a/velox/functions/sparksql/specialforms/FromJson.cpp b/velox/functions/sparksql/specialforms/FromJson.cpp new file mode 100644 index 000000000000..a968d362ca57 --- /dev/null +++ b/velox/functions/sparksql/specialforms/FromJson.cpp @@ -0,0 +1,577 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "velox/functions/sparksql/specialforms/FromJson.h" + +#include +#include + +#include "velox/expression/EvalCtx.h" +#include "velox/expression/SpecialForm.h" +#include "velox/expression/VectorWriters.h" +#include "velox/functions/prestosql/json/SIMDJsonUtil.h" + +using namespace facebook::velox::exec; + +namespace facebook::velox::functions::sparksql { +namespace { + +// Struct for extracting JSON data and writing it with type-specific handling. +template +struct ExtractJsonTypeImpl { + template + static simdjson::error_code + apply(Input input, exec::GenericWriter& writer, bool isRoot) { + return KindDispatcher::apply(input, writer, isRoot); + } + + private: + // Dummy is needed because full/explicit specialization is not allowed inside + // class. + template + struct KindDispatcher { + static simdjson::error_code apply(Input, exec::GenericWriter&, bool) { + VELOX_NYI("Parse json to {} is not supported.", TypeTraits::name); + return simdjson::error_code::UNEXPECTED_ERROR; + } + }; + + template + struct KindDispatcher { + static simdjson::error_code + apply(Input value, exec::GenericWriter& writer, bool /*isRoot*/) { + SIMDJSON_ASSIGN_OR_RAISE(auto type, value.type()); + std::string_view s; + if (type == simdjson::ondemand::json_type::string) { + SIMDJSON_ASSIGN_OR_RAISE(s, value.get_string()); + } else { + s = value.raw_json(); + } + writer.castTo().append(s); + return simdjson::SUCCESS; + } + }; + + template + struct KindDispatcher { + static simdjson::error_code + apply(Input value, exec::GenericWriter& writer, bool /*isRoot*/) { + SIMDJSON_ASSIGN_OR_RAISE(auto type, value.type()); + if (type == simdjson::ondemand::json_type::boolean) { + auto& w = writer.castTo(); + SIMDJSON_ASSIGN_OR_RAISE(w, value.get_bool()); + return simdjson::SUCCESS; + } + return simdjson::INCORRECT_TYPE; + } + }; + + template + struct KindDispatcher { + static simdjson::error_code + apply(Input value, exec::GenericWriter& writer, bool /*isRoot*/) { + return castJsonToInt(value, writer); + } + }; + + template + struct KindDispatcher { + static simdjson::error_code + apply(Input value, exec::GenericWriter& writer, bool /*isRoot*/) { + return castJsonToInt(value, writer); + } + }; + + template + struct KindDispatcher { + static simdjson::error_code + apply(Input value, exec::GenericWriter& writer, bool /*isRoot*/) { + return castJsonToInt(value, writer); + } + }; + + template + struct KindDispatcher { + static simdjson::error_code + apply(Input value, exec::GenericWriter& writer, bool /*isRoot*/) { + return castJsonToInt(value, writer); + } + }; + + template + struct KindDispatcher { + static simdjson::error_code + apply(Input value, exec::GenericWriter& writer, bool /*isRoot*/) { + return castJsonToFloatingPoint(value, writer); + } + }; + + template + struct KindDispatcher { + static simdjson::error_code + apply(Input value, exec::GenericWriter& writer, bool /*isRoot*/) { + return castJsonToFloatingPoint(value, writer); + } + }; + + template + struct KindDispatcher { + static simdjson::error_code + apply(Input value, exec::GenericWriter& writer, bool isRoot) { + auto& writerTyped = writer.castTo>(); + const auto& elementType = writer.type()->childAt(0); + SIMDJSON_ASSIGN_OR_RAISE(auto type, value.type()); + if (type == simdjson::ondemand::json_type::array) { + SIMDJSON_ASSIGN_OR_RAISE(auto array, value.get_array()); + for (const auto& elementResult : array) { + SIMDJSON_ASSIGN_OR_RAISE(auto element, elementResult); + // If casting to array of JSON, nulls in array elements should become + // the JSON text "null". + if (element.is_null()) { + writerTyped.add_null(); + } else { + SIMDJSON_TRY(VELOX_DYNAMIC_TYPE_DISPATCH( + ExtractJsonTypeImpl::apply, + elementType->kind(), + element, + writerTyped.add_item(), + false)); + } + } + } else if (elementType->kind() == TypeKind::ROW && isRoot) { + SIMDJSON_TRY(VELOX_DYNAMIC_TYPE_DISPATCH( + ExtractJsonTypeImpl::apply, + elementType->kind(), + value, + writerTyped.add_item(), + false)); + } else { + return simdjson::INCORRECT_TYPE; + } + return simdjson::SUCCESS; + } + }; + + template + struct KindDispatcher { + static simdjson::error_code + apply(Input value, exec::GenericWriter& writer, bool /*isRoot*/) { + auto& writerTyped = writer.castTo>(); + const auto& valueType = writer.type()->childAt(1); + SIMDJSON_ASSIGN_OR_RAISE(auto object, value.get_object()); + for (const auto& fieldResult : object) { + SIMDJSON_ASSIGN_OR_RAISE(auto field, fieldResult); + SIMDJSON_ASSIGN_OR_RAISE(auto key, field.unescaped_key(true)); + // If casting to map of JSON values, nulls in map values should become + // the JSON text "null". + if (field.value().is_null()) { + writerTyped.add_null().castTo().append(key); + } else { + auto writers = writerTyped.add_item(); + std::get<0>(writers).castTo().append(key); + SIMDJSON_TRY(VELOX_DYNAMIC_TYPE_DISPATCH( + ExtractJsonTypeImpl::apply, + valueType->kind(), + field.value(), + std::get<1>(writers), + false)); + } + } + return simdjson::SUCCESS; + } + }; + + template + struct KindDispatcher { + static simdjson::error_code + apply(Input value, exec::GenericWriter& writer, bool isRoot) { + const auto& rowType = writer.type()->asRow(); + auto& writerTyped = writer.castTo(); + if (value.type().error() != ::simdjson::SUCCESS) { + writerTyped.set_null_at(0); + return simdjson::SUCCESS; + } + const auto type = value.type().value_unsafe(); + if (type == simdjson::ondemand::json_type::object) { + SIMDJSON_ASSIGN_OR_RAISE(auto object, value.get_object()); + + const auto& names = rowType.names(); + bool allFieldsAreAscii = + std::all_of(names.begin(), names.end(), [](const auto& name) { + return functions::stringCore::isAscii(name.data(), name.size()); + }); + + auto fieldIndices = makeFieldIndicesMap(rowType, allFieldsAreAscii); + + std::string key; + for (const auto& fieldResult : object) { + if (fieldResult.error() != ::simdjson::SUCCESS) { + continue; + } + auto field = fieldResult.value_unsafe(); + if (!field.value().is_null()) { + SIMDJSON_ASSIGN_OR_RAISE(key, field.unescaped_key(true)); + + if (allFieldsAreAscii) { + folly::toLowerAscii(key); + } else { + boost::algorithm::to_lower(key); + } + auto it = fieldIndices.find(key); + if (it != fieldIndices.end() && it->second >= 0) { + const auto index = it->second; + it->second = -1; + + const auto res = VELOX_DYNAMIC_TYPE_DISPATCH( + ExtractJsonTypeImpl::apply, + rowType.childAt(index)->kind(), + field.value(), + writerTyped.get_writer_at(index), + false); + if (res != simdjson::SUCCESS) { + writerTyped.set_null_at(index); + } + } + } + } + + for (const auto& [_, index] : fieldIndices) { + if (index >= 0) { + writerTyped.set_null_at(index); + } + } + } else { + // Handle other JSON types: set null to the writer if it's the root doc, + // otherwise return INCORRECT_TYPE to the caller. + if (isRoot) { + writerTyped.set_null_at(0); + return simdjson::SUCCESS; + } else { + return simdjson::INCORRECT_TYPE; + } + } + return simdjson::SUCCESS; + } + }; + + template + static simdjson::error_code castJsonToInt( + Input value, + exec::GenericWriter& writer) { + SIMDJSON_ASSIGN_OR_RAISE(auto type, value.type()); + switch (type) { + case simdjson::ondemand::json_type::number: { + SIMDJSON_ASSIGN_OR_RAISE(auto num, value.get_number()); + switch (num.get_number_type()) { + case simdjson::ondemand::number_type::signed_integer: + return convertIfInRange(num.get_int64(), writer); + case simdjson::ondemand::number_type::unsigned_integer: + return simdjson::NUMBER_OUT_OF_RANGE; + default: + return simdjson::INCORRECT_TYPE; + } + } + default: + return simdjson::INCORRECT_TYPE; + } + return simdjson::SUCCESS; + } + + // Casts a JSON value to a float point, handling both numeric special cases + // for NaN and Infinity. + template + static simdjson::error_code castJsonToFloatingPoint( + Input value, + exec::GenericWriter& writer) { + SIMDJSON_ASSIGN_OR_RAISE(auto type, value.type()); + switch (type) { + case simdjson::ondemand::json_type::number: { + SIMDJSON_ASSIGN_OR_RAISE(auto num, value.get_double()); + return convertIfInRange(num, writer); + } + case simdjson::ondemand::json_type::string: { + SIMDJSON_ASSIGN_OR_RAISE(auto s, value.get_string()); + constexpr T kNaN = std::numeric_limits::quiet_NaN(); + constexpr T kInf = std::numeric_limits::infinity(); + if (s == "NaN") { + writer.castTo() = kNaN; + } else if (s == "+INF" || s == "+Infinity" || s == "Infinity") { + writer.castTo() = kInf; + } else if (s == "-INF" || s == "-Infinity") { + writer.castTo() = -kInf; + } else { + return simdjson::INCORRECT_TYPE; + } + break; + } + default: + return simdjson::INCORRECT_TYPE; + } + return simdjson::SUCCESS; + } + + template + static simdjson::error_code convertIfInRange( + From x, + exec::GenericWriter& writer) { + static_assert(std::is_signed_v && std::is_signed_v); + if constexpr (sizeof(To) < sizeof(From)) { + constexpr From kMin = std::numeric_limits::lowest(); + constexpr From kMax = std::numeric_limits::max(); + if (!(kMin <= x && x <= kMax)) { + return simdjson::NUMBER_OUT_OF_RANGE; + } + } + writer.castTo() = x; + return simdjson::SUCCESS; + } + + // Creates a map of lower case field names to their indices in the row type. + static folly::F14FastMap makeFieldIndicesMap( + const RowType& rowType, + bool allFieldsAreAscii) { + folly::F14FastMap fieldIndices; + const auto size = rowType.size(); + for (auto i = 0; i < size; ++i) { + std::string key = rowType.nameOf(i); + if (allFieldsAreAscii) { + folly::toLowerAscii(key); + } else { + boost::algorithm::to_lower(key); + } + + fieldIndices[key] = i; + } + + return fieldIndices; + } +}; + +/// @brief Parses a JSON string into the specified data type. Supports ROW, +/// ARRAY, and MAP as root types. Key Behavior: +/// - Failure Handling: Returns `NULL` for invalid JSON or incompatible values. +/// - Boolean: Only `true` and `false` are valid; others return `NULL`. +/// - Integral Types: Accepts only integers; floats or strings return `NULL`. +/// - Float/Double: All numbers are valid; strings like `"NaN"`, `"+INF"`, +/// `"+Infinity"`, `"Infinity"`, `"-INF"`, `"-Infinity"` are accepted, others +/// return `NULL`. +/// - Array: Accepts JSON objects only if the array is the root type with ROW +/// child type. +/// - Map: Keys must be `VARCHAR` type. +/// - Row: Partial parsing is supported, but JSON arrays cannot be parsed into a +/// ROW type. +template +class FromJsonFunction final : public exec::VectorFunction { + public: + void apply( + const SelectivityVector& rows, + std::vector& args, // Not using const ref so we can reuse args + const TypePtr& outputType, + exec::EvalCtx& context, + VectorPtr& result) const final { + VELOX_USER_CHECK( + args[0]->isConstantEncoding() || args[0]->isFlatEncoding(), + "Single-arg deterministic functions receive their only argument as flat or constant vector."); + context.ensureWritable(rows, outputType, result); + result->clearNulls(rows); + if (args[0]->isConstantEncoding()) { + parseJsonConstant(args[0], context, rows, *result); + } else { + parseJsonFlat(args[0], context, rows, *result); + } + } + + private: + void parseJsonConstant( + VectorPtr& input, + exec::EvalCtx& context, + const SelectivityVector& rows, + BaseVector& result) const { + // Result is guaranteed to be a flat writable vector. + auto* flatResult = result.as::type>(); + exec::VectorWriter writer; + writer.init(*flatResult); + const auto constInput = input->asUnchecked>(); + if (constInput->isNullAt(0)) { + context.applyToSelectedNoThrow(rows, [&](auto row) { + writer.setOffset(row); + writer.commitNull(); + }); + } else { + const auto constant = constInput->valueAt(0); + paddedInput_.resize(constant.size() + simdjson::SIMDJSON_PADDING); + memcpy(paddedInput_.data(), constant.data(), constant.size()); + simdjson::padded_string_view paddedInput( + paddedInput_.data(), constant.size(), paddedInput_.size()); + + simdjson::ondemand::document jsonDoc; + auto error = simdjsonParse(paddedInput).get(jsonDoc); + + context.applyToSelectedNoThrow(rows, [&](auto row) { + writer.setOffset(row); + if (error != simdjson::SUCCESS || + extractJsonToWriter(jsonDoc, writer) != simdjson::SUCCESS) { + writer.commitNull(); + } + }); + } + + writer.finish(); + } + + void parseJsonFlat( + VectorPtr& input, + exec::EvalCtx& context, + const SelectivityVector& rows, + BaseVector& result) const { + auto* flatResult = result.as::type>(); + exec::VectorWriter writer; + writer.init(*flatResult); + auto* inputVector = input->asUnchecked>(); + size_t maxSize = 0; + rows.applyToSelected([&](auto row) { + if (inputVector->isNullAt(row)) { + return; + } + const auto& input = inputVector->valueAt(row); + maxSize = std::max(maxSize, input.size()); + }); + paddedInput_.resize(maxSize + simdjson::SIMDJSON_PADDING); + context.applyToSelectedNoThrow(rows, [&](auto row) { + writer.setOffset(row); + if (inputVector->isNullAt(row)) { + writer.commitNull(); + return; + } + const auto& input = inputVector->valueAt(row); + memcpy(paddedInput_.data(), input.data(), input.size()); + simdjson::padded_string_view paddedInput( + paddedInput_.data(), input.size(), paddedInput_.size()); + simdjson::ondemand::document doc; + auto error = simdjsonParse(paddedInput).get(doc); + if (error != simdjson::SUCCESS || + extractJsonToWriter(doc, writer) != simdjson::SUCCESS) { + writer.commitNull(); + } + }); + writer.finish(); + } + + // Extracts data from json doc and writes it to writer. + static simdjson::error_code extractJsonToWriter( + simdjson::ondemand::document& doc, + exec::VectorWriter& writer) { + if (doc.is_null()) { + writer.commitNull(); + } else { + SIMDJSON_TRY( + ExtractJsonTypeImpl::apply( + doc, writer.current(), true)); + writer.commit(true); + } + return simdjson::SUCCESS; + } + + // The buffer with extra bytes for parser::parse(), + mutable std::string paddedInput_; +}; + +/// Determines whether a given type is supported. +/// @param isRootType. A flag indicating whether the type is the root type in +/// the evaluation context. Only ROW, ARRAY, and MAP are allowed as root types; +/// this flag helps differentiate such cases. +bool isSupportedType(const TypePtr& type, bool isRootType) { + switch (type->kind()) { + case TypeKind::ARRAY: { + return isSupportedType(type->childAt(0), false); + } + case TypeKind::ROW: { + for (const auto& child : asRowType(type)->children()) { + if (!isSupportedType(child, false)) { + return false; + } + } + return true; + } + case TypeKind::MAP: { + return ( + type->childAt(0)->kind() == TypeKind::VARCHAR && + isSupportedType(type->childAt(1), false)); + } + case TypeKind::BIGINT: { + if (type->isDecimal()) { + return false; + } + return !isRootType; + } + case TypeKind::INTEGER: { + if (type->isDate()) { + return false; + } + return !isRootType; + } + case TypeKind::BOOLEAN: + case TypeKind::SMALLINT: + case TypeKind::TINYINT: + case TypeKind::DOUBLE: + case TypeKind::REAL: + case TypeKind::VARCHAR: { + return !isRootType; + } + default: + return false; + } +} + +} // namespace + +TypePtr FromJsonCallToSpecialForm::resolveType( + const std::vector& /*argTypes*/) { + VELOX_FAIL("from_json function does not support type resolution."); +} + +exec::ExprPtr FromJsonCallToSpecialForm::constructSpecialForm( + const TypePtr& type, + std::vector&& args, + bool trackCpuUsage, + const core::QueryConfig& /*config*/) { + VELOX_USER_CHECK_EQ(args.size(), 1, "from_json expects one argument."); + VELOX_USER_CHECK_EQ( + args[0]->type()->kind(), + TypeKind::VARCHAR, + "The first argument of from_json should be of varchar type."); + + VELOX_USER_CHECK( + isSupportedType(type, true), "Unsupported type {}.", type->toString()); + + std::shared_ptr func; + if (type->kind() == TypeKind::ARRAY) { + func = std::make_shared>(); + } else if (type->kind() == TypeKind::MAP) { + func = std::make_shared>(); + } else { + func = std::make_shared>(); + } + + return std::make_shared( + type, + std::move(args), + func, + exec::VectorFunctionMetadata{}, + kFromJson, + trackCpuUsage); +} +} // namespace facebook::velox::functions::sparksql diff --git a/velox/functions/sparksql/specialforms/FromJson.h b/velox/functions/sparksql/specialforms/FromJson.h new file mode 100644 index 000000000000..9a8c6e5efcdd --- /dev/null +++ b/velox/functions/sparksql/specialforms/FromJson.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "velox/expression/FunctionCallToSpecialForm.h" + +namespace facebook::velox::functions::sparksql { + +class FromJsonCallToSpecialForm : public exec::FunctionCallToSpecialForm { + public: + // Throws not supported exception. + TypePtr resolveType(const std::vector& argTypes) override; + + /// @brief Returns an expression for from_json special form. The expression + /// is a regular expression based on a custom VectorFunction implementation. + exec::ExprPtr constructSpecialForm( + const TypePtr& type, + std::vector&& args, + bool trackCpuUsage, + const core::QueryConfig& config) override; + + static constexpr const char* kFromJson = "from_json"; +}; + +} // namespace facebook::velox::functions::sparksql diff --git a/velox/functions/sparksql/tests/CMakeLists.txt b/velox/functions/sparksql/tests/CMakeLists.txt index 57fdcbf8253e..73606671dd26 100644 --- a/velox/functions/sparksql/tests/CMakeLists.txt +++ b/velox/functions/sparksql/tests/CMakeLists.txt @@ -33,6 +33,7 @@ add_executable( DecimalRoundTest.cpp DecimalUtilTest.cpp ElementAtTest.cpp + FromJsonTest.cpp GetJsonObjectTest.cpp HashTest.cpp InTest.cpp diff --git a/velox/functions/sparksql/tests/FromJsonTest.cpp b/velox/functions/sparksql/tests/FromJsonTest.cpp new file mode 100644 index 000000000000..84dbc55f7a26 --- /dev/null +++ b/velox/functions/sparksql/tests/FromJsonTest.cpp @@ -0,0 +1,249 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include +#include "velox/common/base/tests/GTestUtils.h" +#include "velox/functions/sparksql/tests/SparkFunctionBaseTest.h" + +using namespace facebook::velox::test; + +namespace facebook::velox::functions::sparksql::test { +namespace { +constexpr float kNaNFloat = std::numeric_limits::quiet_NaN(); +constexpr float kInfFloat = std::numeric_limits::infinity(); +constexpr double kNaNDouble = std::numeric_limits::quiet_NaN(); +constexpr double kInfDouble = std::numeric_limits::infinity(); + +class FromJsonTest : public SparkFunctionBaseTest { + protected: + core::CallTypedExprPtr createFromJson(const TypePtr& outputType) { + std::vector inputs = { + std::make_shared(VARCHAR(), "c0")}; + + return std::make_shared( + outputType, std::move(inputs), "from_json"); + } + + void testFromJson(const VectorPtr& input, const VectorPtr& expected) { + auto expr = createFromJson(expected->type()); + testEncodings(expr, {input}, expected); + } +}; + +TEST_F(FromJsonTest, basicStruct) { + auto expected = makeFlatVector({1, 2, 3}); + auto input = makeFlatVector( + {R"({"Id": 1})", R"({"Id": 2})", R"({"Id": 3})"}); + testFromJson(input, makeRowVector({"Id"}, {expected})); +} + +TEST_F(FromJsonTest, basicArray) { + auto expected = makeArrayVector({{1}, {2}, {}}); + auto input = makeFlatVector({R"([1])", R"([2])", R"([])"}); + testFromJson(input, expected); +} + +TEST_F(FromJsonTest, basicMap) { + auto expected = makeMapVector( + {{{"a", 1}}, {{"b", 2}}, {{"c", 3}}, {{"3", 3}}}); + auto input = makeFlatVector( + {R"({"a": 1})", R"({"b": 2})", R"({"c": 3})", R"({"3": 3})"}); + testFromJson(input, expected); +} + +TEST_F(FromJsonTest, basicBool) { + auto expected = makeNullableFlatVector( + {true, false, std::nullopt, std::nullopt, std::nullopt}); + auto input = makeFlatVector( + {R"({"a": true})", + R"({"a": false})", + R"({"a": 1})", + R"({"a": 0.0})", + R"({"a": "true"})"}); + testFromJson(input, makeRowVector({"a"}, {expected})); +} + +TEST_F(FromJsonTest, basicTinyInt) { + auto expected = makeNullableFlatVector( + {1, std::nullopt, std::nullopt, std::nullopt, std::nullopt}); + auto input = makeFlatVector( + {R"({"a": 1})", + R"({"a": -129})", + R"({"a": 128})", + R"({"a": 1.0})", + R"({"a": "1"})"}); + testFromJson(input, makeRowVector({"a"}, {expected})); +} + +TEST_F(FromJsonTest, basicSmallInt) { + auto expected = makeNullableFlatVector( + {1, std::nullopt, std::nullopt, std::nullopt, std::nullopt}); + auto input = makeFlatVector( + {R"({"a": 1})", + R"({"a": -32769})", + R"({"a": 32768})", + R"({"a": 1.0})", + R"({"a": "1"})"}); + testFromJson(input, makeRowVector({"a"}, {expected})); +} + +TEST_F(FromJsonTest, basicInt) { + auto expected = makeNullableFlatVector( + {1, std::nullopt, std::nullopt, std::nullopt, std::nullopt}); + auto input = makeFlatVector( + {R"({"a": 1})", + R"({"a": -2147483649})", + R"({"a": 2147483648})", + R"({"a": 2.0})", + R"({"a": "3"})"}); + testFromJson(input, makeRowVector({"a"}, {expected})); +} + +TEST_F(FromJsonTest, basicBigInt) { + auto expected = + makeNullableFlatVector({1, std::nullopt, std::nullopt}); + auto input = makeFlatVector( + {R"({"a": 1})", R"({"a": 2.0})", R"({"a": "3"})"}); + testFromJson(input, makeRowVector({"a"}, {expected})); +} + +TEST_F(FromJsonTest, basicFloat) { + auto expected = makeNullableFlatVector( + {1.0, + 2.0, + std::nullopt, + kNaNFloat, + -kInfFloat, + -kInfFloat, + kInfFloat, + kInfFloat, + kInfFloat}); + auto input = makeFlatVector( + {R"({"a": 1})", + R"({"a": 2.0})", + R"({"a": "3"})", + R"({"a": "NaN"})", + R"({"a": "-Infinity"})", + R"({"a": "-INF"})", + R"({"a": "+Infinity"})", + R"({"a": "Infinity"})", + R"({"a": "+INF"})"}); + testFromJson(input, makeRowVector({"a"}, {expected})); +} + +TEST_F(FromJsonTest, basicDouble) { + auto expected = makeNullableFlatVector( + {1.0, + 2.0, + std::nullopt, + kNaNDouble, + -kInfDouble, + -kInfDouble, + kInfDouble, + kInfDouble, + kInfDouble}); + auto input = makeFlatVector( + {R"({"a": 1})", + R"({"a": 2.0})", + R"({"a": "3"})", + R"({"a": "NaN"})", + R"({"a": "-Infinity"})", + R"({"a": "-INF"})", + R"({"a": "+Infinity"})", + R"({"a": "Infinity"})", + R"({"a": "+INF"})"}); + testFromJson(input, makeRowVector({"a"}, {expected})); +} + +TEST_F(FromJsonTest, basicString) { + auto expected = makeNullableFlatVector( + {"1", "2.0", "true", "{\"b\": \"test\"}", "[1, 2]"}); + auto input = makeFlatVector( + {R"({"a": 1})", + R"({"a": 2.0})", + R"({"a": "true"})", + R"({"a": {"b": "test"}})", + R"({"a": [1, 2]})"}); + testFromJson(input, makeRowVector({"a"}, {expected})); +} + +TEST_F(FromJsonTest, nestedComplexType) { + auto rowVector = makeRowVector({"a"}, {makeFlatVector({1, 2, 2})}); + std::vector offsets; + offsets.push_back(0); + offsets.push_back(1); + offsets.push_back(2); + auto arrayVector = makeArrayVector(offsets, rowVector); + auto input = makeFlatVector( + {R"({"a": 1})", R"([{"a": 2}])", R"([{"a": 2}])"}); + testFromJson(input, arrayVector); +} + +TEST_F(FromJsonTest, nullOnFailure) { + auto expected = makeNullableFlatVector({1, std::nullopt, 3}); + auto input = + makeFlatVector({R"({"a": 1})", R"({"a" 2})", R"({"a": 3})"}); + testFromJson(input, makeRowVector({"a"}, {expected})); +} + +TEST_F(FromJsonTest, structEmptyArray) { + auto expected = makeNullableFlatVector({std::nullopt, 2, 3}); + auto input = + makeFlatVector({R"([])", R"({"a": 2})", R"({"a": 3})"}); + testFromJson(input, makeRowVector({"a"}, {expected})); +} + +TEST_F(FromJsonTest, structEmptyStruct) { + auto expected = makeNullableFlatVector({std::nullopt, 2, 3}); + auto input = + makeFlatVector({R"({ })", R"({"a": 2})", R"({"a": 3})"}); + testFromJson(input, makeRowVector({"a"}, {expected})); +} + +TEST_F(FromJsonTest, structWrongSchema) { + auto expected = makeNullableFlatVector({std::nullopt, 2, 3}); + auto input = makeFlatVector( + {R"({"b": 2})", R"({"a": 2})", R"({"a": 3})"}); + testFromJson(input, makeRowVector({"a"}, {expected})); +} + +TEST_F(FromJsonTest, structWrongData) { + auto expected = makeNullableFlatVector({std::nullopt, 2, 3}); + auto input = makeFlatVector( + {R"({"a": 2.1})", R"({"a": 2})", R"({"a": 3})"}); + testFromJson(input, makeRowVector({"a"}, {expected})); +} + +TEST_F(FromJsonTest, invalidType) { + auto primitiveTypeOutput = makeFlatVector({2, 2, 3}); + auto dateTypeOutput = makeFlatVector({2, 2, 3}, DATE()); + auto decimalOutput = makeFlatVector({2, 2, 3}, DECIMAL(16, 7)); + auto mapOutput = + makeMapVector({{{1, 1}}, {{2, 2}}, {{3, 3}}}); + auto input = makeFlatVector({R"(2)", R"({2)", R"({3)"}); + VELOX_ASSERT_USER_THROW( + testFromJson(input, primitiveTypeOutput), "Unsupported type BIGINT."); + VELOX_ASSERT_USER_THROW( + testFromJson(input, makeRowVector({"a"}, {dateTypeOutput})), + "Unsupported type ROW."); + VELOX_ASSERT_USER_THROW( + testFromJson(input, makeRowVector({"a"}, {decimalOutput})), + "Unsupported type ROW"); + VELOX_ASSERT_USER_THROW( + testFromJson(input, mapOutput), "Unsupported type MAP."); +} + +} // namespace +} // namespace facebook::velox::functions::sparksql::test