diff --git a/upb/message/BUILD b/upb/message/BUILD index 1b10994878f8..426f27b50f80 100644 --- a/upb/message/BUILD +++ b/upb/message/BUILD @@ -327,7 +327,12 @@ proto_library( name = "utf8_test_proto", testonly = 1, srcs = ["utf8_test.proto"], - deps = ["//src/google/protobuf:test_messages_proto3_proto"], +) + +proto_library( + name = "utf8_test_proto2_proto", + testonly = 1, + srcs = ["utf8_test_proto2.proto"], ) upb_minitable_proto_library( @@ -336,16 +341,30 @@ upb_minitable_proto_library( deps = [":utf8_test_proto"], ) +upb_minitable_proto_library( + name = "utf8_test_proto2_upb_minitable_proto", + testonly = 1, + deps = [":utf8_test_proto2_proto"], +) + upb_c_proto_library( name = "utf8_test_upb_proto", testonly = 1, deps = [":utf8_test_proto"], ) +upb_c_proto_library( + name = "utf8_test_proto2_upb_proto", + testonly = 1, + deps = [":utf8_test_proto2_proto"], +) + cc_test( name = "utf8_test", srcs = ["utf8_test.cc"], deps = [ + ":utf8_test_proto2_upb_minitable_proto", + ":utf8_test_proto2_upb_proto", ":utf8_test_upb_minitable_proto", ":utf8_test_upb_proto", "//upb:base", diff --git a/upb/message/utf8_test.cc b/upb/message/utf8_test.cc index b900a954b3a9..cd7dd1737135 100644 --- a/upb/message/utf8_test.cc +++ b/upb/message/utf8_test.cc @@ -14,6 +14,8 @@ #include "upb/mem/arena.hpp" #include "upb/message/utf8_test.upb.h" #include "upb/message/utf8_test.upb_minitable.h" +#include "upb/message/utf8_test_proto2.upb.h" +#include "upb/message/utf8_test_proto2.upb_minitable.h" #include "upb/wire/decode.h" namespace { @@ -72,6 +74,100 @@ TEST(Utf8Test, RepeatedProto3FieldValidates) { ASSERT_EQ(kUpb_DecodeStatus_BadUtf8, status); } +TEST(Utf8Test, Proto2BytesValidates) { + upb::Arena arena; + size_t size; + char* data = GetBadUtf8Payload(arena.ptr(), &size); + + upb_test_TestUtf8Proto2Bytes* msg = + upb_test_TestUtf8Proto2Bytes_new(arena.ptr()); + + upb_DecodeStatus status; + status = upb_Decode(data, size, UPB_UPCAST(msg), + &upb_0test__TestUtf8Proto2Bytes_msg_init, nullptr, 0, + arena.ptr()); + + // Parse succeeds, because proto2 bytes fields don't validate UTF-8. + ASSERT_EQ(kUpb_DecodeStatus_Ok, status); +} + +TEST(Utf8Test, Proto2RepeatedBytesValidates) { + upb::Arena arena; + size_t size; + char* data = GetBadUtf8Payload(arena.ptr(), &size); + + upb_test_TestUtf8RepeatedProto2Bytes* msg = + upb_test_TestUtf8RepeatedProto2Bytes_new(arena.ptr()); + + upb_DecodeStatus status; + status = upb_Decode(data, size, UPB_UPCAST(msg), + &upb_0test__TestUtf8RepeatedProto2Bytes_msg_init, nullptr, + 0, arena.ptr()); + + // Parse succeeds, because proto2 bytes fields don't validate UTF-8. + ASSERT_EQ(kUpb_DecodeStatus_Ok, status); +} + +TEST(Utf8Test, Proto2StringValidates) { + upb::Arena arena; + size_t size; + char* data = GetBadUtf8Payload(arena.ptr(), &size); + + upb_test_TestUtf8Proto2String* msg = + upb_test_TestUtf8Proto2String_new(arena.ptr()); + + upb_DecodeStatus status; + status = upb_Decode(data, size, UPB_UPCAST(msg), + &upb_0test__TestUtf8Proto2String_msg_init, nullptr, 0, + arena.ptr()); + + // Parse succeeds, because proto2 string fields don't validate UTF-8. + ASSERT_EQ(kUpb_DecodeStatus_Ok, status); +} + +TEST(Utf8Test, Proto2FieldFailsValidation) { + upb::Arena arena; + size_t size; + char* data = GetBadUtf8Payload(arena.ptr(), &size); + + upb_test_TestUtf8Proto2String* msg = + upb_test_TestUtf8Proto2String_new(arena.ptr()); + + upb_DecodeStatus status; + status = upb_Decode(data, size, UPB_UPCAST(msg), + &upb_0test__TestUtf8Proto2String_msg_init, nullptr, 0, + arena.ptr()); + + // Parse fails, because we pass in kUpb_DecodeOption_AlwaysValidateUtf8 to + // force validation of proto2 string fields. + status = upb_Decode(data, size, UPB_UPCAST(msg), + &upb_0test__TestUtf8Proto2String_msg_init, nullptr, + kUpb_DecodeOption_AlwaysValidateUtf8, arena.ptr()); + ASSERT_EQ(kUpb_DecodeStatus_BadUtf8, status); +} + +TEST(Utf8Test, Proto2RepeatedFieldFailsValidation) { + upb::Arena arena; + size_t size; + char* data = GetBadUtf8Payload(arena.ptr(), &size); + + upb_test_TestUtf8RepeatedProto2String* msg = + upb_test_TestUtf8RepeatedProto2String_new(arena.ptr()); + + upb_DecodeStatus status; + status = upb_Decode(data, size, UPB_UPCAST(msg), + &upb_0test__TestUtf8RepeatedProto2String_msg_init, + nullptr, 0, arena.ptr()); + + // Parse fails, because we pass in kUpb_DecodeOption_AlwaysValidateUtf8 to + // force validation of proto2 string fields. + status = + upb_Decode(data, size, UPB_UPCAST(msg), + &upb_0test__TestUtf8RepeatedProto2String_msg_init, nullptr, + kUpb_DecodeOption_AlwaysValidateUtf8, arena.ptr()); + ASSERT_EQ(kUpb_DecodeStatus_BadUtf8, status); +} + // begin:google_only // TEST(Utf8Test, Proto3MixedFieldValidates) { // upb::Arena arena; diff --git a/upb/message/utf8_test.proto b/upb/message/utf8_test.proto index 4070ac7d1968..515e3bf7d3af 100644 --- a/upb/message/utf8_test.proto +++ b/upb/message/utf8_test.proto @@ -35,7 +35,7 @@ message TestUtf8Proto3StringEnforceUtf8False { } message TestUtf8RepeatedProto3StringEnforceUtf8False { - optional string data = 1; + repeated string data = 1; } message TestUtf8Proto3StringEnforceUtf8FalseMixed { diff --git a/upb/message/utf8_test_proto2.proto b/upb/message/utf8_test_proto2.proto new file mode 100644 index 000000000000..6adc1055cd74 --- /dev/null +++ b/upb/message/utf8_test_proto2.proto @@ -0,0 +1,26 @@ +// Protocol Buffers - Google's data interchange format +// Copyright 2023 Google LLC. All rights reserved. +// +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file or at +// https://developers.google.com/open-source/licenses/bsd + +syntax = "proto2"; + +package upb_test; + +message TestUtf8Proto2Bytes { + optional bytes data = 1; +} + +message TestUtf8RepeatedProto2Bytes { + optional bytes data = 1; +} + +message TestUtf8Proto2String { + optional string data = 1; +} + +message TestUtf8RepeatedProto2String { + repeated string data = 1; +} diff --git a/upb/wire/decode.c b/upb/wire/decode.c index a80f65c11c9e..7318648be511 100644 --- a/upb/wire/decode.c +++ b/upb/wire/decode.c @@ -1005,18 +1005,24 @@ int _upb_Decoder_GetVarintOp(const upb_MiniTableField* field) { return kVarintOps[field->UPB_PRIVATE(descriptortype)]; } -UPB_FORCEINLINE -static void _upb_Decoder_CheckUnlinked(upb_Decoder* d, const upb_MiniTable* mt, - const upb_MiniTableField* field, - int* op) { - // If sub-message is not linked, treat as unknown. - if (field->UPB_PRIVATE(mode) & kUpb_LabelFlags_IsExtension) return; +// This returns a DecodeOp enum (kUpb_DecodeOp_SubMessage, +// kUpb_DecodeOp_UnknownField, etc). +static int _upb_Decoder_CheckUnlinked(upb_Decoder* d, const upb_MiniTable* mt, + const upb_MiniTableField* field) { + if (field->UPB_PRIVATE(mode) & kUpb_LabelFlags_IsExtension) { + return kUpb_DecodeOp_SubMessage; + } + const upb_MiniTable* mt_sub = _upb_MiniTableSubs_MessageByField(mt->UPB_PRIVATE(subs), field); - if ((d->options & kUpb_DecodeOption_ExperimentalAllowUnlinked) || - !UPB_PRIVATE(_upb_MiniTable_IsEmpty)(mt_sub)) { - return; + if (!UPB_PRIVATE(_upb_MiniTable_IsEmpty)(mt_sub)) { + return kUpb_DecodeOp_SubMessage; } + + if (d->options & kUpb_DecodeOption_ExperimentalAllowUnlinked) { + return kUpb_DecodeOp_SubMessage; + } + #ifndef NDEBUG const upb_MiniTableField* oneof = upb_MiniTable_GetOneof(mt, field); if (oneof) { @@ -1030,13 +1036,26 @@ static void _upb_Decoder_CheckUnlinked(upb_Decoder* d, const upb_MiniTable* mt, } while (upb_MiniTable_NextOneofField(mt, &oneof)); } #endif // NDEBUG - *op = kUpb_DecodeOp_UnknownField; + + return kUpb_DecodeOp_UnknownField; +} + +static int _upb_Decoder_MaybeVerifyUtf8(upb_Decoder* d, + const upb_MiniTableField* field) { + if (!(field->UPB_ONLYBITS(mode) & kUpb_LabelFlags_IsAlternate) || + !UPB_UNLIKELY(d->options & kUpb_DecodeOption_AlwaysValidateUtf8)) + return kUpb_DecodeOp_Bytes; + + return kUpb_DecodeOp_String; } -int _upb_Decoder_GetDelimitedOp(upb_Decoder* d, const upb_MiniTable* mt, - const upb_MiniTableField* field) { +static int _upb_Decoder_GetDelimitedOp(upb_Decoder* d, const upb_MiniTable* mt, + const upb_MiniTableField* field) { enum { kRepeatedBase = 19 }; + // This table is used to map field types to decode ops. However, not all the + // field types here may be used, since specialized logic in this function may + // determine the decode op based on more than just the field type. static const int8_t kDelimitedOps[] = { /* For non-repeated field type. */ [kUpb_FakeFieldType_FieldNotFound] = @@ -1083,15 +1102,17 @@ int _upb_Decoder_GetDelimitedOp(upb_Decoder* d, const upb_MiniTable* mt, // repeated msgset type }; - int ndx = field->UPB_PRIVATE(descriptortype); - if (upb_MiniTableField_IsArray(field)) ndx += kRepeatedBase; - int op = kDelimitedOps[ndx]; + const int ndx = field->UPB_PRIVATE(descriptortype); + const bool is_array = upb_MiniTableField_IsArray(field); - if (op == kUpb_DecodeOp_SubMessage) { - _upb_Decoder_CheckUnlinked(d, mt, field, &op); + if (ndx == kUpb_FieldType_Message || + (ndx == kUpb_FieldType_Group && is_array)) { + return _upb_Decoder_CheckUnlinked(d, mt, field); + } else if (ndx == kUpb_FieldType_Bytes) { + return _upb_Decoder_MaybeVerifyUtf8(d, field); + } else { + return kDelimitedOps[ndx + (is_array ? kRepeatedBase : 0)]; } - - return op; } UPB_FORCEINLINE @@ -1133,8 +1154,7 @@ static const char* _upb_Decoder_DecodeWireValue(upb_Decoder* d, const char* ptr, case kUpb_WireType_StartGroup: val->uint32_val = field->UPB_PRIVATE(number); if (field->UPB_PRIVATE(descriptortype) == kUpb_FieldType_Group) { - *op = kUpb_DecodeOp_SubMessage; - _upb_Decoder_CheckUnlinked(d, mt, field, op); + *op = _upb_Decoder_CheckUnlinked(d, mt, field); } else if (field->UPB_PRIVATE(descriptortype) == kUpb_FakeFieldType_MessageSetItem) { *op = kUpb_DecodeOp_MessageSetItem; diff --git a/upb/wire/decode.h b/upb/wire/decode.h index 7f2eafddca8f..68bf63f35669 100644 --- a/upb/wire/decode.h +++ b/upb/wire/decode.h @@ -83,6 +83,16 @@ enum { * be created by the parser or the message-copying logic in message/copy.h. */ kUpb_DecodeOption_ExperimentalAllowUnlinked = 4, + + /* EXPERIMENTAL: + * + * If set, decoding will enforce UTF-8 validation for string fields, even for + * proto2 or fields with `features.utf8_validation = NONE`. Normally, only + * proto3 string fields will be validated for UTF-8. Decoding will return + * kUpb_DecodeStatus_BadUtf8 for non-UTF-8 strings, which is the same behavior + * as non-UTF-8 proto3 string fields. + */ + kUpb_DecodeOption_AlwaysValidateUtf8 = 8, }; UPB_INLINE uint32_t upb_DecodeOptions_MaxDepth(uint16_t depth) {