-
Notifications
You must be signed in to change notification settings - Fork 1.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AVRO-4026: [c++] Add new methods to CustomAttributes to allow non-string custom attributes in schemas #3266
base: main
Are you sure you want to change the base?
Changes from 5 commits
ab67c99
07a2f4d
6740ac4
3820153
005e082
2fd25e1
0f0efd2
e95f691
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -92,7 +92,11 @@ Entity loadEntity(const char *text) { | |
Entity loadEntity(InputStream &in) { | ||
JsonParser p; | ||
p.init(in); | ||
return readEntity(p); | ||
Entity e = readEntity(p); | ||
if (p.hasMore()) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not checking it in the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just from the names and signatures, it looked like But for this method, the caller can't safely do anything with the input after it's called, so seemed more intuitive that it should enforce that the entire stream is consumed. None of these have doc comments, so it wasn't clear the intent everywhere, so I left things flexible and only changed this function, which appears safe to change without breaking any caller assumptions. |
||
throw TooManyValuesException(); | ||
} | ||
return e; | ||
} | ||
|
||
Entity loadEntity(const uint8_t *text, size_t len) { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -177,6 +177,12 @@ AVRO_DECL Entity loadEntity(const uint8_t *text, size_t len); | |
|
||
void writeEntity(JsonGenerator<JsonNullFormatter> &g, const Entity &n); | ||
|
||
class AVRO_DECL TooManyValuesException : public virtual std::runtime_error { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't see any other exception is defined except the one in the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Well, we catch this type of exception. If we just threw |
||
public: | ||
explicit TooManyValuesException() : | ||
std::runtime_error("invalid JSON document: expecting a single JSON value but found more than one") {} | ||
}; | ||
|
||
} // namespace json | ||
} // namespace avro | ||
|
||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -49,6 +49,28 @@ char JsonParser::next() { | |||||
return ch; | ||||||
} | ||||||
|
||||||
bool JsonParser::hasMore() { | ||||||
if (peeked || hasNext && !isspace(nextChar)) { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
return true; | ||||||
} | ||||||
if (!hasNext) { | ||||||
nextChar = ' '; | ||||||
} | ||||||
// We return true if there are any more tokens; we ignore | ||||||
// any trailing whitespace. | ||||||
while (isspace(nextChar)) { | ||||||
if (nextChar == '\n') { | ||||||
line_++; | ||||||
} | ||||||
if (!in_.hasMore()) { | ||||||
return false; | ||||||
} | ||||||
nextChar = in_.read(); | ||||||
} | ||||||
hasNext = true; | ||||||
return true; | ||||||
} | ||||||
|
||||||
void JsonParser::expectToken(Token tk) { | ||||||
if (advance() != tk) { | ||||||
if (tk == Token::Double) { | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -116,6 +116,8 @@ public: | |
return curToken; | ||
} | ||
|
||
bool hasMore(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is it possible not to add |
||
|
||
void expectToken(Token tk); | ||
|
||
bool boolValue() const { | ||
|
Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -19,27 +19,76 @@ | |||||||||||||||||||||||||||||||||||||
#ifndef avro_CustomAttributes_hh__ | ||||||||||||||||||||||||||||||||||||||
#define avro_CustomAttributes_hh__ | ||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||
#include "Config.hh" | ||||||||||||||||||||||||||||||||||||||
#include <iostream> | ||||||||||||||||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: is it possible to remove this line? It looks weird that a public header exposes iostream. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't think so. One of the methods of
|
||||||||||||||||||||||||||||||||||||||
#include <map> | ||||||||||||||||||||||||||||||||||||||
#include <optional> | ||||||||||||||||||||||||||||||||||||||
#include <string> | ||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||
#include "Config.hh" | ||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||
namespace avro { | ||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||
// CustomAttributes class stores avro custom attributes. | ||||||||||||||||||||||||||||||||||||||
// Each attribute is represented by a unique name and value. | ||||||||||||||||||||||||||||||||||||||
// User is supposed to create CustomAttributes object and then add it to Schema. | ||||||||||||||||||||||||||||||||||||||
class AVRO_DECL CustomAttributes { | ||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||
public: | ||||||||||||||||||||||||||||||||||||||
// Retrieves the custom attribute json entity for that attributeName, returns an | ||||||||||||||||||||||||||||||||||||||
// null if the attribute doesn't exist. | ||||||||||||||||||||||||||||||||||||||
enum ValueMode { | ||||||||||||||||||||||||||||||||||||||
// When a CustomAttributes is created using this mode, all values are strings. | ||||||||||||||||||||||||||||||||||||||
// The value should not be quoted, but any interior quotes and special | ||||||||||||||||||||||||||||||||||||||
// characters (such as newlines) must be escaped. | ||||||||||||||||||||||||||||||||||||||
string, | ||||||||||||||||||||||||||||||||||||||
// When a CustomAttributes is created using this mode, all values are JSON | ||||||||||||||||||||||||||||||||||||||
// values. String values must be quoted and escaped. | ||||||||||||||||||||||||||||||||||||||
json | ||||||||||||||||||||||||||||||||||||||
}; | ||||||||||||||||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
I found that the styles of enum are not consistent. Personally I prefer upper-case. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As the docstring is user-facing, is it better to separate the expected behavior of write and read path? In a more complicated case, what is the expected behavior if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
This isn't possible. The There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Isn't There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The value mode simply determines how this We don't need to know the value mode used to serialize a JSON schema. The JSON string is known to be valid JSON at this point., since the compiler will have already parsed it into an entity. It may have only string custom attributes or it might not. If it was serialized using "json" value mode, we can still process it with "string" value mode, as long as all custom attributes are strings. And we can always process any input in "json" value mode (the new non-deprecated mode). The current logic (prior to this PR) cannot handle custom attributes unless they are strings. While this library cannot currently generate schemas that have non-string custom attributes (at least as of v1.11.2), all other Avro libraries can generate that sort of file. That's the bug we're trying to fix in this PR: this library currently throws an exception if an Avro file has a schema with non-string custom attributes. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the explanation! It sounds good to me. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated the enum to use |
||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||
// Creates a new CustomAttributes object where all values are strings. | ||||||||||||||||||||||||||||||||||||||
// All values passed to addAttribute() and returned from getAttribute() or the | ||||||||||||||||||||||||||||||||||||||
// attributes() map will not be enclosed in quotes. However, any internal quotes | ||||||||||||||||||||||||||||||||||||||
// WILL be escaped and other special characters MAY be escaped. | ||||||||||||||||||||||||||||||||||||||
// | ||||||||||||||||||||||||||||||||||||||
// To support non-string values, use CustomAttributes(CustomAttributes::json) instead. | ||||||||||||||||||||||||||||||||||||||
CustomAttributes() : CustomAttributes(string) {} | ||||||||||||||||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I know the default value is to keep backward-compatibility. However, we have to swap the default value in the future because the current behavior is wrong for non-string values. (I'm not well-versed in this codebase yet) Is it possible to figure out if the avro file is created by avro-cpp as well as its writer version? If yes, perhaps we can choose the correct default There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We always use This is here for backwards compatibility of users of this API since this is a public type defined in a public header. So even though internal usages of There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK, that makes sense. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we mark it as There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, that's a very good idea. We might even want to deprecate the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Deprecated these elements in 0f0efd2. I am testing using clang on OS X. I don't see any other deprecations in this C++ codebase, so I can't find any other examples of muting the compiler warnings that result from using the deprecated elements in the implementation and the tests. Let me know how I should handle this. For now, I've added clang-specific |
||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||
// Creates a new CustomAttributes object. | ||||||||||||||||||||||||||||||||||||||
// | ||||||||||||||||||||||||||||||||||||||
// If the given mode is string, all values must be strings. All values passed to | ||||||||||||||||||||||||||||||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Trimmed away redundant comments in 0f0efd2. |
||||||||||||||||||||||||||||||||||||||
// addAttribute() and returned from getAttribute() or the attributes() map will not | ||||||||||||||||||||||||||||||||||||||
// be enclosed in quotes. However, any internal quotes and newlines WILL be escaped | ||||||||||||||||||||||||||||||||||||||
// and other special characters MAY be escaped. | ||||||||||||||||||||||||||||||||||||||
// | ||||||||||||||||||||||||||||||||||||||
// If the given mode is json, the values support any valid JSON type. In this more, | ||||||||||||||||||||||||||||||||||||||
// all values passed to addAttribute() and returned from getAttribute() or the | ||||||||||||||||||||||||||||||||||||||
// attributes() map must be valid JSON values; string values must be quoted and | ||||||||||||||||||||||||||||||||||||||
// escaped. | ||||||||||||||||||||||||||||||||||||||
CustomAttributes(ValueMode valueMode); | ||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||
// Retrieves the custom attribute string for the given name. Returns an empty | ||||||||||||||||||||||||||||||||||||||
// value if the attribute doesn't exist. | ||||||||||||||||||||||||||||||||||||||
// | ||||||||||||||||||||||||||||||||||||||
// If this CustomAttributes was created in json mode, the returned string will | ||||||||||||||||||||||||||||||||||||||
// be a JSON document (and string values will be quoted and escaped). Otherwise, | ||||||||||||||||||||||||||||||||||||||
// the returned string will be a string value, though interior quotes and newlines | ||||||||||||||||||||||||||||||||||||||
// will be escaped and other special characters may be escaped. | ||||||||||||||||||||||||||||||||||||||
std::optional<std::string> getAttribute(const std::string &name) const; | ||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||
// Adds a custom attribute. If the attribute already exists, throw an exception. | ||||||||||||||||||||||||||||||||||||||
// Adds a custom attribute. | ||||||||||||||||||||||||||||||||||||||
// | ||||||||||||||||||||||||||||||||||||||
// If this CustomAttributes was create in json mode, the given value string must | ||||||||||||||||||||||||||||||||||||||
// be a valid JSON document. So if the value is a string, it must be quoted and | ||||||||||||||||||||||||||||||||||||||
// escaped. Otherwise, the given value string is an unquoted string value (though | ||||||||||||||||||||||||||||||||||||||
// interior quotes and newlines must still be escaped). | ||||||||||||||||||||||||||||||||||||||
// | ||||||||||||||||||||||||||||||||||||||
// If the attribute already exists or if the given value is not a valid JSON | ||||||||||||||||||||||||||||||||||||||
// document or not a correctly escaped string, throw an exception. | ||||||||||||||||||||||||||||||||||||||
void addAttribute(const std::string &name, const std::string &value); | ||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||
// Provides a way to iterate over the custom attributes or check attribute size. | ||||||||||||||||||||||||||||||||||||||
// The values in this map are the same as those returned from getAttribute. So | ||||||||||||||||||||||||||||||||||||||
// the value may be a JSON document or an unquoted string, depending on whether | ||||||||||||||||||||||||||||||||||||||
// this CustomAttributes was created in json or string mode. | ||||||||||||||||||||||||||||||||||||||
const std::map<std::string, std::string> &attributes() const { | ||||||||||||||||||||||||||||||||||||||
return attributes_; | ||||||||||||||||||||||||||||||||||||||
} | ||||||||||||||||||||||||||||||||||||||
|
@@ -48,6 +97,7 @@ public: | |||||||||||||||||||||||||||||||||||||
void printJson(std::ostream &os, const std::string &name) const; | ||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||
private: | ||||||||||||||||||||||||||||||||||||||
ValueMode valueMode_; | ||||||||||||||||||||||||||||||||||||||
std::map<std::string, std::string> attributes_; | ||||||||||||||||||||||||||||||||||||||
}; | ||||||||||||||||||||||||||||||||||||||
|
||||||||||||||||||||||||||||||||||||||
|
Original file line number | Diff line number | Diff line change | |||||||||
---|---|---|---|---|---|---|---|---|---|---|---|
|
@@ -440,29 +440,31 @@ struct TestSchema { | ||||||||||
std::vector<GenericDatum> defaultValues; | |||||||||||
concepts::MultiAttribute<CustomAttributes> customAttributes; | |||||||||||
|
|||||||||||
CustomAttributes cf; | |||||||||||
cf.addAttribute("stringField", std::string("\\\"field value with \\\"double quotes\\\"\\\"")); | |||||||||||
cf.addAttribute("booleanField", std::string("true")); | |||||||||||
cf.addAttribute("numberField", std::string("1.23")); | |||||||||||
cf.addAttribute("nullField", std::string("null")); | |||||||||||
cf.addAttribute("arrayField", std::string("[1]")); | |||||||||||
cf.addAttribute("mapField", std::string("{\\\"key1\\\":\\\"value1\\\", \\\"key2\\\":\\\"value2\\\"}")); | |||||||||||
CustomAttributes ca(CustomAttributes::json); | |||||||||||
ca.addAttribute("stringField", std::string("\"foobar\"")); | |||||||||||
ca.addAttribute("stringFieldComplex", std::string("\"\\\" a field value with \\\"double quotes\\\" \\\"\"")); | |||||||||||
ca.addAttribute("booleanField", std::string("true")); | |||||||||||
ca.addAttribute("numberField", std::string("1.23")); | |||||||||||
ca.addAttribute("nullField", std::string("null")); | |||||||||||
ca.addAttribute("arrayField", std::string("[1]")); | |||||||||||
ca.addAttribute("mapField", std::string("{\"key1\":\"value1\", \"key2\":\"value2\"}")); | |||||||||||
fieldNames.add("f1"); | |||||||||||
fieldValues.add(NodePtr(new NodePrimitive(Type::AVRO_LONG))); | |||||||||||
customAttributes.add(cf); | |||||||||||
customAttributes.add(ca); | |||||||||||
|
|||||||||||
NodeRecord nodeRecordWithCustomAttribute(nameConcept, fieldValues, | |||||||||||
fieldNames, fieldAliases, defaultValues, | |||||||||||
customAttributes); | |||||||||||
std::string expectedJsonWithCustomAttribute = | |||||||||||
"{\"type\": \"record\", \"name\": \"Test\",\"fields\": " | |||||||||||
"[{\"name\": \"f1\", \"type\": \"long\", " | |||||||||||
"\"arrayField\": \"[1]\", " | |||||||||||
"\"booleanField\": \"true\", " | |||||||||||
"\"mapField\": \"{\\\"key1\\\":\\\"value1\\\", \\\"key2\\\":\\\"value2\\\"}\", " | |||||||||||
"\"nullField\": \"null\", " | |||||||||||
"\"numberField\": \"1.23\", " | |||||||||||
"\"stringField\": \"\\\"field value with \\\"double quotes\\\"\\\"\"" | |||||||||||
"\"arrayField\": [1], " | |||||||||||
"\"booleanField\": true, " | |||||||||||
"\"mapField\": {\"key1\":\"value1\", \"key2\":\"value2\"}, " | |||||||||||
"\"nullField\": null, " | |||||||||||
"\"numberField\": 1.23, " | |||||||||||
"\"stringField\": \"foobar\", " | |||||||||||
"\"stringFieldComplex\": \"\\\" a field value with \\\"double quotes\\\" \\\"\"" | |||||||||||
"}]}"; | |||||||||||
testNodeRecord(nodeRecordWithCustomAttribute, | |||||||||||
expectedJsonWithCustomAttribute); | |||||||||||
|
@@ -489,12 +491,26 @@ struct TestSchema { | ||||||||||
expectedJsonWithoutCustomAttribute); | |||||||||||
} | |||||||||||
|
|||||||||||
void checkCustomAttributes_getAttribute() { | |||||||||||
CustomAttributes cf; | |||||||||||
cf.addAttribute("field1", std::string("1")); | |||||||||||
void checkCustomAttributes_addAndGetAttributeJson() { | |||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. IMHO these cases are not sufficient. We need to cover the expected behavior of the following cases (even exceptions are expected in some of them):
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think the only thing missing from the matrix was the failure tests for passing a "string" value mode style value when mode is actually "json" and vice versa. I updated the tests in e95f691, and they caught an issue: we were only handling |
|||||||||||
CustomAttributes ca(CustomAttributes::json); | |||||||||||
ca.addAttribute("field1", std::string("true")); | |||||||||||
|
|||||||||||
BOOST_CHECK_EQUAL(std::string("1"), *cf.getAttribute("field1")); | |||||||||||
BOOST_CHECK_EQUAL(false, cf.getAttribute("not_existing").has_value()); | |||||||||||
BOOST_CHECK_EQUAL(std::string("true"), *ca.getAttribute("field1")); | |||||||||||
BOOST_CHECK_EQUAL(false, ca.getAttribute("not_existing").has_value()); | |||||||||||
} | |||||||||||
|
|||||||||||
void checkCustomAttributes_addAndGetAttributeString() { | |||||||||||
CustomAttributes ca; | |||||||||||
ca.addAttribute("field1", std::string("true")); | |||||||||||
ca.addAttribute("field2", std::string("value with \\\"quotes\\\"")); | |||||||||||
|
|||||||||||
BOOST_CHECK_EQUAL(std::string("true"), *ca.getAttribute("field1")); | |||||||||||
BOOST_CHECK_EQUAL(std::string("value with \\\"quotes\\\""), *ca.getAttribute("field2")); | |||||||||||
BOOST_CHECK_EQUAL(false, ca.getAttribute("not_existing").has_value()); | |||||||||||
|
|||||||||||
std::ostringstream oss; | |||||||||||
ca.printJson(oss, "field2"); | |||||||||||
BOOST_CHECK_EQUAL(std::string("\"field2\": \"value with \\\"quotes\\\"\""), oss.str()); | |||||||||||
} | |||||||||||
|
|||||||||||
void test() { | |||||||||||
|
@@ -521,7 +537,8 @@ struct TestSchema { | ||||||||||
|
|||||||||||
checkNodeRecordWithoutCustomAttribute(); | |||||||||||
checkNodeRecordWithCustomAttribute(); | |||||||||||
checkCustomAttributes_getAttribute(); | |||||||||||
checkCustomAttributes_addAndGetAttributeJson(); | |||||||||||
checkCustomAttributes_addAndGetAttributeString(); | |||||||||||
} | |||||||||||
|
|||||||||||
ValidSchema schema_; | |||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do we change this? Is it a breaking change to downstream users who are calling
stringValue
?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, this is a method on
json::Entity
which I did not touch. This was the source of the exception: if there was a non-string value for a custom attribute, thestringValue
method would throw an exception. But now that we support all kinds of values, we use itstoString
method, which will return properly formatted JSON string to represent whatever is theEntity
's value.