Skip to content

Commit

Permalink
Fixed UTF-8 TextFormat output to protect against invalid UTF-8 in str…
Browse files Browse the repository at this point in the history
…ing fields.

This guarantees that the output of TextFormat is always valid UTF-8. Prior to this change, any invalid UTF-8 in a string field would pass through unescaped into the TextFormat output if users enabled `SetUseUtf8StringEscaping(true)` or called `Utf8DebugString()`.

This change currently only affects users who explicitly set `SetUseUtf8StringEscaping(true)` or use `Utf8DebugString()`, but the hope is to flip the default so that this mode is enabled unless overridden.

PiperOrigin-RevId: 589844142
  • Loading branch information
haberman authored and copybara-github committed Dec 11, 2023
1 parent d605b48 commit 1ac8c04
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 2 deletions.
85 changes: 83 additions & 2 deletions src/google/protobuf/text_format.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <atomic>
#include <climits>
#include <cmath>
#include <cstddef>
#include <limits>
#include <string>
#include <utility>
Expand Down Expand Up @@ -47,6 +48,7 @@
#include "google/protobuf/repeated_field.h"
#include "google/protobuf/unknown_field_set.h"
#include "google/protobuf/wire_format_lite.h"
#include "utf8_validity.h"

// Must be included last.
#include "google/protobuf/port_def.inc"
Expand Down Expand Up @@ -1647,6 +1649,83 @@ class TextFormat::Printer::DebugStringFieldValuePrinter
}
};

namespace {

// Returns true if `ch` needs to be escaped in TextFormat, independent of any
// UTF-8 validity issues.
//
// The bytes reported here are:
//   * the ASCII control characters 0x00-0x1F and DEL (0x7F), which are
//     unprintable, and
//   * the printable characters that are significant inside a quoted string
//     literal: double quote, single quote and backslash.
//
// High bytes (>= 0x80) are never reported by this function; whether they need
// escaping depends on UTF-8 validity, which is decided separately.
bool DefinitelyNeedsEscape(unsigned char ch) {
  // C0 control characters are never emitted raw.
  if (ch < 0x20) return true;
  switch (ch) {
    case 0x7F:  // DEL is a control character too, even though it is >= 0x20.
    case '\"':
    case '\'':
    case '\\':
      return true;
    default:
      return false;
  }
}

// Reports whether `ch` is a non-ASCII ("high") byte, i.e. one with its top bit
// set. Such bytes may only be emitted unescaped when they form part of a
// valid UTF-8 sequence, so the caller must validate them; if validation fails
// the byte has to be escaped.
bool NeedsUtf8Validation(unsigned char ch) {
  return (ch & 0x80) != 0;
}

// Computes the length of the longest prefix of `val` that can be written to
// TextFormat output verbatim, without any escaping.
//
// The scan behaves like utf8_range::SpanStructurallyValid(), with one
// addition: it also stops at the first ASCII byte for which
// `DefinitelyNeedsEscape(ch)` is true.
//
// A variant of utf8_range::SpanStructurallyValid() that could itself stop on
// those bytes might be more efficient, but that code is heavily SIMD-based
// and would be much more complicated to modify.
size_t SkipPassthroughBytes(absl::string_view val) {
  const size_t len = val.size();
  size_t pos = 0;
  while (pos < len) {
    const unsigned char byte = static_cast<unsigned char>(val[pos]);
    if (DefinitelyNeedsEscape(byte)) return pos;

    if (!NeedsUtf8Validation(byte)) {
      // Plain printable ASCII: nothing to check, move on.
      ++pos;
      continue;
    }

    // Locate the end of the current run of consecutive high bytes so that only
    // high bytes are handed to the UTF-8 checker. That way the ASCII bytes
    // never need a second pass to look for characters requiring an escape.
    //
    // High bytes are assumed to be rarer than plain printable ASCII, so
    // scanning them twice (once here, once in the checker) is acceptable.
    size_t run_end = pos + 1;
    while (run_end < len &&
           NeedsUtf8Validation(static_cast<unsigned char>(val[run_end]))) {
      ++run_end;
    }

    const size_t run_len = run_end - pos;
    const size_t valid_len =
        utf8_range::SpanStructurallyValid(val.substr(pos, run_len));
    // Stop at the first byte of the run that is not part of valid UTF-8.
    if (valid_len != run_len) return pos + valid_len;

    // The whole run validated; resume scanning right after it.
    pos = run_end;
  }
  return len;
}

// Writes `src` to `generator` as UTF-8 text while guarding against invalid
// UTF-8 in the string field: runs of bytes that are safe to pass through are
// printed verbatim, and every other byte is printed individually in its
// absl::CEscape() form.
//
// If a future invariant guarantees that invalid UTF-8 can never be present,
// the UTF-8 check performed here could be dropped.
void HardenedPrintString(absl::string_view src,
                         TextFormat::BaseTextGenerator* generator) {
  while (!src.empty()) {
    // Emit the longest prefix that needs no escaping in a single call.
    const size_t passthrough_len = SkipPassthroughBytes(src);
    if (passthrough_len > 0) {
      generator->PrintString(src.substr(0, passthrough_len));
      src.remove_prefix(passthrough_len);
    }
    if (src.empty()) return;

    // The byte now at the front must be escaped. Should the per-byte
    // CEscape() + PrintString() calls prove expensive, they could be batched
    // at the cost of some complexity.
    generator->PrintString(absl::CEscape(src.substr(0, 1)));
    src.remove_prefix(1);
  }
}

} // namespace

// ===========================================================================
// An internal field value printer that escapes UTF-8 strings.
class TextFormat::Printer::FastFieldValuePrinterUtf8Escaping
Expand All @@ -1655,7 +1734,7 @@ class TextFormat::Printer::FastFieldValuePrinterUtf8Escaping
// Prints the string field value `val` to `generator`, surrounded by double
// quotes.
//
// NOTE(review): this span comes from a rendered diff whose +/- markers are
// not visible. The absl::Utf8SafeCEscape() call looks like the removed
// (pre-change) line and the HardenedPrintString() call its replacement, so
// only one of the two is expected in the real file -- confirm against the
// actual source.
void PrintString(const std::string& val,
TextFormat::BaseTextGenerator* generator) const override {
// Opening quote.
generator->PrintLiteral("\"");
generator->PrintString(absl::Utf8SafeCEscape(val));
HardenedPrintString(val, generator);
// Closing quote.
generator->PrintLiteral("\"");
}
void PrintBytes(const std::string& val,
Expand Down Expand Up @@ -1956,7 +2035,9 @@ void TextFormat::FastFieldValuePrinter::PrintEnum(
// Prints the string field value `val` to `generator`, surrounded by double
// quotes, with the contents passed through absl::CEscape().
//
// NOTE(review): this span comes from a rendered diff whose +/- markers are
// not visible. The unconditional CEscape() call looks like the removed
// (pre-change) line and the `!val.empty()` guarded call its replacement, so
// only one of the two is expected in the real file -- confirm against the
// actual source.
void TextFormat::FastFieldValuePrinter::PrintString(
const std::string& val, BaseTextGenerator* generator) const {
// Opening quote.
generator->PrintLiteral("\"");
generator->PrintString(absl::CEscape(val));
// Skips the escape-and-print step entirely for an empty value.
if (!val.empty()) {
generator->PrintString(absl::CEscape(val));
}
// Closing quote.
generator->PrintLiteral("\"");
}
void TextFormat::FastFieldValuePrinter::PrintBytes(
Expand Down
1 change: 1 addition & 0 deletions src/google/protobuf/text_format_unittest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
#include "google/protobuf/unittest_mset.pb.h"
#include "google/protobuf/unittest_mset_wire_format.pb.h"
#include "google/protobuf/unittest_proto3.pb.h"
#include "utf8_validity.h"


// Must be included last.
Expand Down

0 comments on commit 1ac8c04

Please sign in to comment.