From fd1433954f8344ffa433a42734e0dce1cd71deea Mon Sep 17 00:00:00 2001 From: Oscar Westra van Holthe - Kind Date: Fri, 1 Dec 2023 12:01:13 +0100 Subject: [PATCH] AVRO-3666: [JAVA] Separate parsing from Schema class (#2513) This allows using pluggable parser implementations, allowing multiple formats to be parsed with the same code. This includes the use of NameValidator and parsing multiple files with circular references between them. --- .editorconfig | 2 + .mvn/extensions.xml | 2 +- .../Getting started (Java)/_index.md | 6 +- .../apache/avro/FormattedSchemaParser.java | 77 ++++ .../org/apache/avro/JsonSchemaParser.java | 82 ++++ .../java/org/apache/avro/NameValidator.java | 104 +++++ .../java/org/apache/avro/ParseContext.java | 234 +++++++++++ .../src/main/java/org/apache/avro/Schema.java | 99 +---- .../java/org/apache/avro/SchemaParser.java | 249 ++++++++++++ .../org/apache/avro/file/DataFileStream.java | 3 +- .../org/apache/avro/util/SchemaResolver.java | 367 ++++++++++++++++++ .../org/apache/avro/util}/SchemaVisitor.java | 24 +- .../java/org/apache/avro/util}/Schemas.java | 13 +- .../org/apache/avro/util/UtfTextUtils.java | 247 ++++++++++++ .../org/apache/avro/DummySchemaParser.java | 53 +++ .../org/apache/avro/ParseContextTest.java | 124 ++++++ .../apache/avro/SchemaNameValidatorTest.java | 44 +-- .../org/apache/avro/TestDataFileReader.java | 6 +- .../test/java/org/apache/avro/TestSchema.java | 6 +- .../org/apache/avro/TestSchemaBuilder.java | 2 +- .../org/apache/avro/TestSchemaParser.java | 107 +++++ .../org/apache/avro/reflect/TestReflect.java | 5 +- .../apache/avro/util}/TestSchemaResolver.java | 41 +- .../org/apache/avro/util}/TestSchemas.java | 20 +- .../apache/avro/util/UtfTextUtilsTest.java | 132 +++++++ .../org.apache.avro.FormattedSchemaParser | 18 + lang/java/build.sh | 3 +- lang/java/idl/pom.xml | 1 + .../java/org/apache/avro/idl/IdlFile.java | 27 +- .../java/org/apache/avro/idl/IdlReader.java | 133 ++++--- .../org/apache/avro/idl/IdlSchemaParser.java | 45 +++ .../avro/idl/IsResolvedSchemaVisitor.java | 60 --- .../org/apache/avro/idl/ResolvingVisitor.java | 192 --------- .../org/apache/avro/idl/SchemaResolver.java | 149 ------- .../apache/avro/idl/SchemaVisitorAction.java | 40 -- .../org.apache.avro.FormattedSchemaParser | 18 + .../org/apache/avro/idl/IdlReaderTest.java | 4 +- .../java/org/apache/avro/idl/TestCycle.java | 11 +- .../test/java/org/apache/avro/TestSchema.java | 6 +- .../java/org/apache/avro/mojo/IDLMojo.java | 16 +- lang/java/pom.xml | 2 + .../java/org/apache/avro/tool/IdlTool.java | 8 +- 42 files changed, 2088 insertions(+), 694 deletions(-) create mode 100644 lang/java/avro/src/main/java/org/apache/avro/FormattedSchemaParser.java create mode 100644 lang/java/avro/src/main/java/org/apache/avro/JsonSchemaParser.java create mode 100644 lang/java/avro/src/main/java/org/apache/avro/NameValidator.java create mode 100644 lang/java/avro/src/main/java/org/apache/avro/ParseContext.java create mode 100644 lang/java/avro/src/main/java/org/apache/avro/SchemaParser.java create mode 100644 lang/java/avro/src/main/java/org/apache/avro/util/SchemaResolver.java rename lang/java/{idl/src/main/java/org/apache/avro/idl => avro/src/main/java/org/apache/avro/util}/SchemaVisitor.java (77%) rename lang/java/{idl/src/main/java/org/apache/avro/idl => avro/src/main/java/org/apache/avro/util}/Schemas.java (89%) create mode 100644 lang/java/avro/src/main/java/org/apache/avro/util/UtfTextUtils.java create mode 100644 lang/java/avro/src/test/java/org/apache/avro/DummySchemaParser.java create mode 100644 lang/java/avro/src/test/java/org/apache/avro/ParseContextTest.java create mode 100644 lang/java/avro/src/test/java/org/apache/avro/TestSchemaParser.java rename lang/java/{idl/src/test/java/org/apache/avro/idl => avro/src/test/java/org/apache/avro/util}/TestSchemaResolver.java (62%) rename lang/java/{idl/src/test/java/org/apache/avro/idl => avro/src/test/java/org/apache/avro/util}/TestSchemas.java (90%) create mode 100644 lang/java/avro/src/test/java/org/apache/avro/util/UtfTextUtilsTest.java create mode 100644 lang/java/avro/src/test/resources/META-INF/services/org.apache.avro.FormattedSchemaParser create mode 100644 lang/java/idl/src/main/java/org/apache/avro/idl/IdlSchemaParser.java delete mode 100644 lang/java/idl/src/main/java/org/apache/avro/idl/IsResolvedSchemaVisitor.java delete mode 100644 lang/java/idl/src/main/java/org/apache/avro/idl/ResolvingVisitor.java delete mode 100644 lang/java/idl/src/main/java/org/apache/avro/idl/SchemaResolver.java delete mode 100644 lang/java/idl/src/main/java/org/apache/avro/idl/SchemaVisitorAction.java create mode 100644 lang/java/idl/src/main/resources/META-INF/services/org.apache.avro.FormattedSchemaParser diff --git a/.editorconfig b/.editorconfig index a2a93880be0..b96e2b9c6e8 100644 --- a/.editorconfig +++ b/.editorconfig @@ -19,6 +19,8 @@ root = true charset = utf-8 end_of_line = lf insert_final_newline = true +ij_any_block_comment_at_first_column = false +ij_any_line_comment_at_first_column = false [*.{java,xml,sh}] indent_style = space diff --git a/.mvn/extensions.xml b/.mvn/extensions.xml index c6000969d38..e2e84018d96 100644 --- a/.mvn/extensions.xml +++ b/.mvn/extensions.xml @@ -20,6 +20,6 @@ org.apache.maven.extensions maven-build-cache-extension - 1.0.0 + 1.0.1 diff --git a/doc/content/en/docs/++version++/Getting started (Java)/_index.md b/doc/content/en/docs/++version++/Getting started (Java)/_index.md index 2eae94b2925..2d964c9c16c 100644 --- a/doc/content/en/docs/++version++/Getting started (Java)/_index.md +++ b/doc/content/en/docs/++version++/Getting started (Java)/_index.md @@ -77,7 +77,7 @@ You may also build the required Avro jars from source. Building Avro is beyond t ## Defining a schema -Avro schemas are defined using JSON. Schemas are composed of primitive types (null, boolean, int, long, float, double, bytes, and string) and complex types (record, enum, array, map, union, and fixed). You can learn more about Avro schemas and types from the specification, but for now let's start with a simple schema example, user.avsc: +Avro schemas are defined using JSON or IDL (the latter requires an extra dependency). Schemas are composed of primitive types (null, boolean, int, long, float, double, bytes, and string) and complex types (record, enum, array, map, union, and fixed). You can learn more about Avro schemas and types from the specification, but for now let's start with a simple schema example, user.avsc: ```json {"namespace": "example.avro", @@ -209,10 +209,10 @@ Data in Avro is always stored with its corresponding schema, meaning we can alwa Let's go over the same example as in the previous section, but without using code generation: we'll create some users, serialize them to a data file on disk, and then read back the file and deserialize the users objects. ### Creating users -First, we use a Parser to read our schema definition and create a Schema object. +First, we use a SchemaParser to read our schema definition and create a Schema object. ```java -Schema schema = new Schema.Parser().parse(new File("user.avsc")); +Schema schema = new SchemaParser().parse(new File("user.avsc")); ``` Using this schema, let's create some users. diff --git a/lang/java/avro/src/main/java/org/apache/avro/FormattedSchemaParser.java b/lang/java/avro/src/main/java/org/apache/avro/FormattedSchemaParser.java new file mode 100644 index 00000000000..cd67788fa9e --- /dev/null +++ b/lang/java/avro/src/main/java/org/apache/avro/FormattedSchemaParser.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.avro; + +import java.io.IOException; +import java.net.URI; + +/** + * Schema parser for a specific schema format. + * + *

+ * The {@link SchemaParser} class uses this interface, supporting text based + * schema sources. + *

+ * + *

+ * Implementations are located using a {@link java.util.ServiceLoader} and must + * therefore be threadsafe. See the {@code ServiceLoader} class for details on + * loading your implementation. + *

+ * + * @see java.util.ServiceLoader + */ +public interface FormattedSchemaParser { + /** + *

+ * Parse schema definitions from a text based source. + *

+ * + *

Notes for implementers:

+ * + * + * + * @param parseContext the current parse context: all parsed schemata should + * be added here to resolve names with; contains all + * previously known types + * @param baseUri the base location of the schema, or {@code null} if + * not known + * @param formattedSchema the text of the schema definition(s) to parse + * @return the main schema, if any + * @throws IOException when the schema cannot be read + * @throws SchemaParseException when the schema cannot be parsed + */ + Schema parse(ParseContext parseContext, URI baseUri, CharSequence formattedSchema) + throws IOException, SchemaParseException; +} diff --git a/lang/java/avro/src/main/java/org/apache/avro/JsonSchemaParser.java b/lang/java/avro/src/main/java/org/apache/avro/JsonSchemaParser.java new file mode 100644 index 00000000000..c7d91878627 --- /dev/null +++ b/lang/java/avro/src/main/java/org/apache/avro/JsonSchemaParser.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.avro; + +import java.io.IOException; +import java.net.URI; + +/** + * Schema parser for JSON formatted schemata. This initial implementation simply + * delegates to the {@link Schema.Parser} class, though it should be refactored + * out of there. + * + *

+ * Note: this class is intentionally not available via the Java + * {@link java.util.ServiceLoader}, as its use is hardcoded as fallback when no + * service exists. This enables users to reliably override the standard JSON + * parser as well. + *

+ */ +public class JsonSchemaParser implements FormattedSchemaParser { + /** + *

+ * Parse a schema written in the internal (JSON) format without any validations. + *

+ * + *

+ * Using this method is only safe if used to parse a write schema (i.e., a + * schema used to read Avro data). Other usages, for example by generated Avro + * code, can cause interoperability problems. + *

+ * + *

+ * Use with care and sufficient testing! + *

+ * + * @param fragments one or more strings making up the schema (some schemata + * exceed the compiler limits) + * @return the parsed schema + */ + public static Schema parseInternal(String... fragments) { + StringBuilder buffer = new StringBuilder(); + for (String fragment : fragments) { + buffer.append(fragment); + } + return new JsonSchemaParser().parse(new ParseContext(NameValidator.NO_VALIDATION), buffer, null); + } + + @Override + public Schema parse(ParseContext parseContext, URI baseUri, CharSequence formattedSchema) + throws IOException, SchemaParseException { + return parse(parseContext, formattedSchema, parseContext.nameValidator); + } + + private Schema parse(ParseContext parseContext, CharSequence formattedSchema, NameValidator nameValidator) + throws SchemaParseException { + Schema.Parser parser = new Schema.Parser(nameValidator); + if (nameValidator == NameValidator.NO_VALIDATION) { + parser.setValidateDefaults(false); + } else { + parser = new Schema.Parser(nameValidator); + } + parser.addTypes(parseContext.typesByName().values()); + Schema schema = parser.parse(formattedSchema.toString()); + parser.getTypes().values().forEach(parseContext::put); + return schema; + } +} diff --git a/lang/java/avro/src/main/java/org/apache/avro/NameValidator.java b/lang/java/avro/src/main/java/org/apache/avro/NameValidator.java new file mode 100644 index 00000000000..f1262d922cf --- /dev/null +++ b/lang/java/avro/src/main/java/org/apache/avro/NameValidator.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.avro; + +public interface NameValidator { + + class Result { + private final String errors; + + public Result(final String errors) { + this.errors = errors; + } + + public boolean isOK() { + return this == NameValidator.OK; + } + + public String getErrors() { + return errors; + } + } + + Result OK = new Result(null); + + default Result validate(String name) { + return OK; + } + + NameValidator NO_VALIDATION = new NameValidator() { + }; + + NameValidator UTF_VALIDATOR = new NameValidator() { + @Override + public Result validate(final String name) { + if (name == null) { + return new Result("Null name"); + } + int length = name.length(); + if (length == 0) { + return new Result("Empty name"); + } + char first = name.charAt(0); + if (!(Character.isLetter(first) || first == '_')) { + return new Result("Illegal initial character: " + name); + } + for (int i = 1; i < length; i++) { + char c = name.charAt(i); + if (!(Character.isLetterOrDigit(c) || c == '_')) { + return new Result("Illegal character in: " + name); + } + } + return OK; + } + }; + + NameValidator STRICT_VALIDATOR = new NameValidator() { + @Override + public Result validate(final String name) { + if (name == null) { + return new Result("Null name"); + } + int length = name.length(); + if (length == 0) { + return new Result("Empty name"); + } + char first = name.charAt(0); + if (!(isLetter(first) || first == '_')) { + return new Result("Illegal initial character: " + name); + } + for (int i = 1; i < length; i++) { + char c = name.charAt(i); + if (!(isLetter(c) || isDigit(c) || c == '_')) { + return new Result("Illegal character in: " + name); + } + } + return OK; + } + + private boolean isLetter(char c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); + } + + private boolean isDigit(char c) { + return c >= '0' && c <= '9'; + } + + }; + +} diff --git a/lang/java/avro/src/main/java/org/apache/avro/ParseContext.java b/lang/java/avro/src/main/java/org/apache/avro/ParseContext.java new file mode 100644 index 00000000000..401c93e50c5 --- /dev/null +++ b/lang/java/avro/src/main/java/org/apache/avro/ParseContext.java @@ -0,0 +1,234 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.avro; + +import org.apache.avro.util.SchemaResolver; + +import java.util.EnumSet; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.Map; +import java.util.Set; + +/** + * Class to define a name context, useful to reference schemata with. This + * allows for the following: + * + * + * + *

+ * Note: this class has no use for most Avro users, but is a key component when + * implementing a schema parser. + *

+ * + * @see JSON based + * schema definition + **/ +public class ParseContext { + private static final Map PRIMITIVES = new HashMap<>(); + + static { + PRIMITIVES.put("string", Schema.Type.STRING); + PRIMITIVES.put("bytes", Schema.Type.BYTES); + PRIMITIVES.put("int", Schema.Type.INT); + PRIMITIVES.put("long", Schema.Type.LONG); + PRIMITIVES.put("float", Schema.Type.FLOAT); + PRIMITIVES.put("double", Schema.Type.DOUBLE); + PRIMITIVES.put("boolean", Schema.Type.BOOLEAN); + PRIMITIVES.put("null", Schema.Type.NULL); + } + + private static final Set NAMED_SCHEMA_TYPES = EnumSet.of(Schema.Type.RECORD, Schema.Type.ENUM, + Schema.Type.FIXED); + private final Map oldSchemas; + private final Map newSchemas; + // Visible for use in JsonSchemaParser + final NameValidator nameValidator; + + /** + * Create a {@code ParseContext} for the default/{@code null} namespace, using + * default name validation for new schemata. + */ + public ParseContext() { + this(NameValidator.UTF_VALIDATOR); + } + + /** + * Create a {@code ParseContext} using the specified name validation for new + * schemata. + */ + public ParseContext(NameValidator nameValidator) { + this(nameValidator, new LinkedHashMap<>(), new LinkedHashMap<>()); + } + + private ParseContext(NameValidator nameValidator, Map oldSchemas, Map newSchemas) { + this.nameValidator = nameValidator; + this.oldSchemas = oldSchemas; + this.newSchemas = newSchemas; + } + + /** + * Create a derived context using a different fallback namespace. + * + * @return a new context + */ + public ParseContext namespace() { + return new ParseContext(nameValidator, oldSchemas, newSchemas); + } + + /** + * Tell whether this context contains a schema with the given name. + * + * @param name a schema name + * @return {@code true} if the context contains a schema with this name, + * {@code false} otherwise + */ + public boolean contains(String name) { + return PRIMITIVES.containsKey(name) || oldSchemas.containsKey(name) || newSchemas.containsKey(name); + } + + /** + *

+ * Resolve a schema by name. + *

+ * + *

+ * That is: + *

+ * + *
    + *
  • If {@code fullName} is a primitive name, return a (new) schema for + * it
  • + *
  • Otherwise: resolve the schema in its own namespace and in the null + * namespace (the former takes precedence)
  • + *
+ * + * Resolving means that the schema is returned if known, and otherwise an + * unresolved schema (a reference) is returned. + * + * @param fullName the full schema name to resolve + * @return the schema + * @throws SchemaParseException when the schema does not exist + */ + public Schema resolve(String fullName) { + Schema.Type type = PRIMITIVES.get(fullName); + if (type != null) { + return Schema.create(type); + } + + Schema schema = getSchema(fullName); + if (schema == null) { + // Not found; attempt to resolve in the default namespace + int lastDot = fullName.lastIndexOf('.'); + String name = fullName.substring(lastDot + 1); + schema = getSchema(name); + } + + return schema != null ? schema : SchemaResolver.unresolvedSchema(fullName); + } + + private Schema getSchema(String fullName) { + Schema schema = oldSchemas.get(fullName); + if (schema == null) { + schema = newSchemas.get(fullName); + } + return schema; + } + + private boolean notEmpty(String str) { + return str != null && !str.isEmpty(); + } + + /** + * Put the schema into this context. This is an idempotent operation: it only + * fails if this context already has a different schema with the same name. + * + *

+ * Note that although this method works for all types except for arrays, maps + * and unions, all primitive types have already been defined upon construction. + * This means you cannot redefine a 'long' with a logical timestamp type. + *

+ * + * @param schema the schema to put into the context + */ + public void put(Schema schema) { + if (!(NAMED_SCHEMA_TYPES.contains(schema.getType()))) { + throw new AvroTypeException("You can only put a named schema into the context"); + } + + String fullName = requireValidFullName(schema.getFullName()); + + Schema alreadyKnownSchema = oldSchemas.get(fullName); + if (alreadyKnownSchema != null) { + if (!schema.equals(alreadyKnownSchema)) { + throw new SchemaParseException("Can't redefine: " + fullName); + } + } else { + Schema previouslyAddedSchema = newSchemas.putIfAbsent(fullName, schema); + if (previouslyAddedSchema != null && !previouslyAddedSchema.equals(schema)) { + throw new SchemaParseException("Can't redefine: " + fullName); + } + } + } + + private String requireValidFullName(String fullName) { + String[] names = fullName.split("\\."); + for (int i = 0; i < names.length - 1; i++) { + validateName(names[i], "Namespace part"); + } + validateName(names[names.length - 1], "Name"); + return fullName; + } + + private void validateName(String name, String what) { + NameValidator.Result result = nameValidator.validate(name); + if (!result.isOK()) { + throw new SchemaParseException(what + " \"" + name + "\" is invalid: " + result.getErrors()); + } + } + + public boolean hasNewSchemas() { + return !newSchemas.isEmpty(); + } + + public void commit() { + oldSchemas.putAll(newSchemas); + newSchemas.clear(); + } + + public void rollback() { + newSchemas.clear(); + } + + /** + * Return all known types by their fullname. + * + * @return a map of all types by their name + */ + public Map typesByName() { + LinkedHashMap result = new LinkedHashMap<>(); + result.putAll(oldSchemas); + result.putAll(newSchemas); + return result; + } +} diff --git a/lang/java/avro/src/main/java/org/apache/avro/Schema.java b/lang/java/avro/src/main/java/org/apache/avro/Schema.java index 38a6e4a9e42..f312ecfb6f2 100644 --- a/lang/java/avro/src/main/java/org/apache/avro/Schema.java +++ b/lang/java/avro/src/main/java/org/apache/avro/Schema.java @@ -25,6 +25,12 @@ import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.node.DoubleNode; import com.fasterxml.jackson.databind.node.NullNode; +import org.apache.avro.util.internal.Accessor; +import org.apache.avro.util.internal.Accessor.FieldAccessor; +import org.apache.avro.util.internal.JacksonUtils; +import org.apache.avro.util.internal.ThreadLocalWithInitial; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.ByteArrayOutputStream; import java.io.File; @@ -51,13 +57,6 @@ import java.util.stream.Collectors; import java.util.stream.StreamSupport; -import org.apache.avro.util.internal.Accessor; -import org.apache.avro.util.internal.Accessor.FieldAccessor; -import org.apache.avro.util.internal.JacksonUtils; -import org.apache.avro.util.internal.ThreadLocalWithInitial; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import static org.apache.avro.LogicalType.LOGICAL_TYPE_PROP; /** @@ -1490,7 +1489,7 @@ public NullSchema() { */ public static class Parser { private Names names = new Names(); - private final Schema.NameValidator validate; + private final NameValidator validate; private boolean validateDefaults = true; public Parser() { @@ -1760,18 +1759,18 @@ public Schema put(Name name, Schema schema) { } } - private static ThreadLocal validateNames = ThreadLocalWithInitial + private static ThreadLocal validateNames = ThreadLocalWithInitial .of(() -> NameValidator.UTF_VALIDATOR); private static String validateName(String name) { NameValidator.Result result = validateNames.get().validate(name); if (!result.isOK()) { - throw new SchemaParseException(result.errors); + throw new SchemaParseException(result.getErrors()); } return name; } - public static void setNameValidator(final Schema.NameValidator validator) { + public static void setNameValidator(final NameValidator validator) { Schema.validateNames.set(validator); } @@ -2311,84 +2310,6 @@ private static String getFieldAlias(Name record, String field, Map= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); - } - - private boolean isDigit(char c) { - return c >= '0' && c <= '9'; - } - - }; - - } - /** * No change is permitted on LockableArrayList once lock() has been called on * it. diff --git a/lang/java/avro/src/main/java/org/apache/avro/SchemaParser.java b/lang/java/avro/src/main/java/org/apache/avro/SchemaParser.java new file mode 100644 index 00000000000..dfb3c01f353 --- /dev/null +++ b/lang/java/avro/src/main/java/org/apache/avro/SchemaParser.java @@ -0,0 +1,249 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.avro; + +import org.apache.avro.util.UtfTextUtils; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.net.URI; +import java.nio.charset.Charset; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.ServiceLoader; + +/** + * Avro schema parser for text-based formats like JSON, IDL, etc. + * + *

+ * Parses formatted (i.e., text based) schemata from a given source using the + * available {@link FormattedSchemaParser} implementations, and returns the + * first result. This means it can transparently handle any schema format. The + * Avro project defines a JSON based format and an IDL format (the latter + * available as a separate dependency), but you can also provide your own. + *

+ * + *

+ * The parser can handle various text based sources. If the source contains a + * UTF encoded latin text based format it can even detect which UTF encoding was + * used (UTF-8, UTF16BE, UTF16LE, UTF-32BE or UTF32LE). + *

+ * + * @see FormattedSchemaParser + * @see UtfTextUtils + */ +public class SchemaParser { + private final ParseContext parseContext; + private final Collection formattedSchemaParsers; + + /** + * Create a schema parser. Initially, the list of known (named) schemata is + * empty. + */ + public SchemaParser() { + this.parseContext = new ParseContext(); + this.formattedSchemaParsers = new ArrayList<>(); + for (FormattedSchemaParser formattedSchemaParser : ServiceLoader.load(FormattedSchemaParser.class)) { + formattedSchemaParsers.add(formattedSchemaParser); + } + // Add the default / JSON parser last (not as a service, even though it + // implements the service interface), to allow implementations that parse JSON + // files into schemata differently. + formattedSchemaParsers.add(new JsonSchemaParser()); + } + + /** + * Parse an Avro schema from a file. The file content is assumed to be UTF-8 + * text. + * + * @param file the file to read + * @return the schema + * @throws IOException when the schema cannot be read + * @throws SchemaParseException if parsing the schema failed; contains + * suppressed underlying parse exceptions if + * available + * @see UtfTextUtils + */ + public Schema parse(File file) throws IOException, SchemaParseException { + return parse(file, null); + } + + /** + * Parse an Avro schema from a file written with a specific character set. + * + * @param file the file to read + * @param charset the character set of the file contents + * @return the schema + * @throws IOException when the schema cannot be read + * @throws SchemaParseException if parsing the schema failed; contains + * suppressed underlying parse exceptions if + * available + */ + public Schema parse(File file, Charset charset) throws IOException, SchemaParseException { + return parse(file.toPath(), charset); + } + + /** + * Parse an Avro schema from a file. The file content is assumed to be UTF-8 + * text. + * + * @param file the file to read + * @return the schema + * @throws IOException when the schema cannot be read + * @throws SchemaParseException if parsing the schema failed; contains + * suppressed underlying parse exceptions if + * available + * @see UtfTextUtils + */ + public Schema parse(Path file) throws IOException, SchemaParseException { + return parse(file, null); + } + + /** + * Parse an Avro schema from a file written with a specific character set. + * + * @param file the file to read + * @param charset the character set of the file contents + * @return the schema + * @throws IOException when the schema cannot be read + * @throws SchemaParseException if parsing the schema failed; contains + * suppressed underlying parse exceptions if + * available + */ + public Schema parse(Path file, Charset charset) throws IOException, SchemaParseException { + URI inputDir = file.getParent().toUri(); + try (InputStream stream = Files.newInputStream(file)) { + String formattedSchema = UtfTextUtils.readAllBytes(stream, charset); + return parse(inputDir, formattedSchema); + } + } + + /** + * Parse an Avro schema from an input stream. The stream content is assumed to + * be UTF-8 text. Note that the stream stays open after reading. + * + * @param in the stream to read + * @return the schema + * @throws IOException when the schema cannot be read + * @throws SchemaParseException if parsing the schema failed; contains + * suppressed underlying parse exceptions if + * available + * @see UtfTextUtils + */ + public Schema parse(InputStream in) throws IOException, SchemaParseException { + return parse(in, null); + } + + /** + * Parse an Avro schema from an input stream. Note that the stream stays open + * after reading. + * + * @param in the stream to read + * @param charset the character set of the stream contents + * @return the schema + * @throws IOException when the schema cannot be read + * @throws SchemaParseException if parsing the schema failed; contains + * suppressed underlying parse exceptions if + * available + */ + public Schema parse(InputStream in, Charset charset) throws IOException, SchemaParseException { + return parse(UtfTextUtils.readAllBytes(in, charset)); + } + + /** + * Parse an Avro schema from an input reader. + * + * @param in the stream to read + * @return the schema + * @throws IOException when the schema cannot be read + * @throws SchemaParseException if parsing the schema failed; contains + * suppressed underlying parse exceptions if + * available + */ + public Schema parse(Reader in) throws IOException, SchemaParseException { + return parse(UtfTextUtils.readAllChars(in)); + } + + /** + * Parse an Avro schema from a string. + * + * @param text the text to parse + * @return the schema + * @throws SchemaParseException if parsing the schema failed; contains + * suppressed underlying parse exceptions if + * available + */ + public Schema parse(CharSequence text) throws SchemaParseException { + try { + return parse(null, text); + } catch (IOException e) { + // This can only happen if parser implementations try to read other (related) + // schemata from somewhere. + throw new AvroRuntimeException("Could not read schema", e); + } + } + + /** + * Parse the given schema (string) within the specified context using all + * available {@link FormattedSchemaParser} implementations, collecting any + * {@link SchemaParseException}s that occur, and return the first successfully + * parsed schema. If all parsers fail, throw a {@code SchemaParseException} with + * all collected parse exceptions added as suppressed exceptions. Uses the base + * location of the schema (e.g., the directory where the schema file lives) if + * available. + * + * @param baseUri the base location of the schema, or {@code null} if + * not known + * @param formattedSchema the schema as text + * @return the parsed schema + * @throws IOException if thrown by one of the parsers + * @throws RuntimeException if thrown by one of the parsers + * @throws SchemaParseException when all parsers fail + */ + private Schema parse(URI baseUri, CharSequence formattedSchema) throws IOException, SchemaParseException { + List parseExceptions = new ArrayList<>(); + for (FormattedSchemaParser formattedSchemaParser : formattedSchemaParsers) { + try { + Schema schema = formattedSchemaParser.parse(parseContext, baseUri, formattedSchema); + if (parseContext.hasNewSchemas()) { + // Parsing succeeded: return the result. + parseContext.commit(); + return schema; + } + } catch (SchemaParseException e) { + parseContext.rollback(); + parseExceptions.add(e); + } + } + + // None of the available parsers succeeded + + if (parseExceptions.size() == 1) { + throw parseExceptions.get(0); + } + SchemaParseException parseException = new SchemaParseException( + "Could not parse the schema (the suppressed exceptions tell why)."); + parseExceptions.forEach(parseException::addSuppressed); + throw parseException; + } +} diff --git a/lang/java/avro/src/main/java/org/apache/avro/file/DataFileStream.java b/lang/java/avro/src/main/java/org/apache/avro/file/DataFileStream.java index 150d2ace9ba..e9b5ed38852 100644 --- a/lang/java/avro/src/main/java/org/apache/avro/file/DataFileStream.java +++ b/lang/java/avro/src/main/java/org/apache/avro/file/DataFileStream.java @@ -34,6 +34,7 @@ import org.apache.avro.AvroRuntimeException; import org.apache.avro.InvalidAvroMagicException; +import org.apache.avro.NameValidator; import org.apache.avro.Schema; import org.apache.avro.io.BinaryEncoder; import org.apache.avro.io.DecoderFactory; @@ -139,7 +140,7 @@ void initialize(InputStream in, byte[] magic) throws IOException { // finalize the header header.metaKeyList = Collections.unmodifiableList(header.metaKeyList); - header.schema = new Schema.Parser(Schema.NameValidator.NO_VALIDATION).setValidateDefaults(false) + header.schema = new Schema.Parser(NameValidator.NO_VALIDATION).setValidateDefaults(false) .parse(getMetaString(DataFileConstants.SCHEMA)); this.codec = resolveCodec(); reader.setSchema(header.schema); diff --git a/lang/java/avro/src/main/java/org/apache/avro/util/SchemaResolver.java b/lang/java/avro/src/main/java/org/apache/avro/util/SchemaResolver.java new file mode 100644 index 00000000000..c3a25a5e577 --- /dev/null +++ b/lang/java/avro/src/main/java/org/apache/avro/util/SchemaResolver.java @@ -0,0 +1,367 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.avro.util; + +import org.apache.avro.AvroTypeException; +import org.apache.avro.JsonProperties; +import org.apache.avro.ParseContext; +import org.apache.avro.Protocol; +import org.apache.avro.Schema; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.EnumSet; +import java.util.HashSet; +import java.util.IdentityHashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Function; +import java.util.stream.Collectors; + +import static org.apache.avro.Schema.Type.ARRAY; +import static org.apache.avro.Schema.Type.ENUM; +import static org.apache.avro.Schema.Type.FIXED; +import static org.apache.avro.Schema.Type.MAP; +import static org.apache.avro.Schema.Type.RECORD; +import static org.apache.avro.Schema.Type.UNION; + +/** + * Utility class to resolve schemas that are unavailable at the point they are + * referenced in the IDL. + */ +public final class SchemaResolver { + + private SchemaResolver() { + } + + private static final String UR_SCHEMA_ATTR = "org.apache.avro.idl.unresolved.name"; + + private static final String UR_SCHEMA_NAME = "UnresolvedSchema"; + + private static final String UR_SCHEMA_NS = "org.apache.avro.compiler"; + + private static final AtomicInteger COUNTER = new AtomicInteger(); + + /** + * Create a schema to represent an "unresolved" schema. (used to represent a + * schema whose definition does not exist, yet). + * + * @param name a schema name + * @return an unresolved schema for the given name + */ + public static Schema unresolvedSchema(final String name) { + Schema schema = Schema.createRecord(UR_SCHEMA_NAME + '_' + COUNTER.getAndIncrement(), "unresolved schema", + UR_SCHEMA_NS, false, Collections.emptyList()); + schema.addProp(UR_SCHEMA_ATTR, name); + return schema; + } + + /** + * Is this an unresolved schema. + * + * @param schema a schema + * @return whether the schema is an unresolved schema + */ + public static boolean isUnresolvedSchema(final Schema schema) { + return (schema.getType() == Schema.Type.RECORD && schema.getProp(UR_SCHEMA_ATTR) != null && schema.getName() != null + && schema.getName().startsWith(UR_SCHEMA_NAME) && UR_SCHEMA_NS.equals(schema.getNamespace())); + } + + /** + * Get the unresolved schema name. + * + * @param schema an unresolved schema + * @return the name of the unresolved schema + */ + public static String getUnresolvedSchemaName(final Schema schema) { + if (!isUnresolvedSchema(schema)) { + throw new IllegalArgumentException("Not a unresolved schema: " + schema); + } + return schema.getProp(UR_SCHEMA_ATTR); + } + + /** + * Is this an unresolved schema? + */ + public static boolean isFullyResolvedSchema(final Schema schema) { + if (isUnresolvedSchema(schema)) { + return false; + } else { + return Schemas.visit(schema, new IsResolvedSchemaVisitor()); + } + } + + /** + * Clone the provided schema while resolving all unreferenced schemas. + * + * @param parseContext the parse context with known names + * @param schema the schema to resolve + * @return a copy of the schema with all schemas resolved + */ + public static Schema resolve(final ParseContext parseContext, Schema schema) { + if (schema == null) { + return null; + } + ResolvingVisitor visitor = new ResolvingVisitor(schema, parseContext::resolve); + return Schemas.visit(schema, visitor); + } + + /** + * Clone all provided schemas while resolving all unreferenced schemas. + * + * @param parseContext the parse context with known names + * @param schemas the schemas to resolve + * @return a copy of all schemas with all schemas resolved + */ + public static Collection resolve(final ParseContext parseContext, Collection schemas) { + ResolvingVisitor visitor = new ResolvingVisitor(null, parseContext::resolve); + return schemas.stream().map(schema -> Schemas.visit(schema, visitor.withRoot(schema))).collect(Collectors.toList()); + } + + /** + * Will clone the provided protocol while resolving all unreferenced schemas + * + * @param parseContext the parse context with known names + * @param protocol the protocol to resolve + * @return a copy of the protocol with all schemas resolved + */ + public static Protocol resolve(ParseContext parseContext, final Protocol protocol) { + // Create an empty copy of the protocol + Protocol result = new Protocol(protocol.getName(), protocol.getDoc(), protocol.getNamespace()); + protocol.getObjectProps().forEach(((JsonProperties) result)::addProp); + + ResolvingVisitor visitor = new ResolvingVisitor(null, parseContext::resolve); + Function resolver = schema -> Schemas.visit(schema, visitor.withRoot(schema)); + + // Resolve all schemata in the protocol. + result.setTypes(protocol.getTypes().stream().map(resolver).collect(Collectors.toList())); + Map resultMessages = result.getMessages(); + protocol.getMessages().forEach((name, oldValue) -> { + Protocol.Message newValue; + if (oldValue.isOneWay()) { + newValue = result.createMessage(oldValue.getName(), oldValue.getDoc(), oldValue, + resolver.apply(oldValue.getRequest())); + } else { + Schema request = resolver.apply(oldValue.getRequest()); + Schema response = resolver.apply(oldValue.getResponse()); + Schema errors = resolver.apply(oldValue.getErrors()); + newValue = result.createMessage(oldValue.getName(), oldValue.getDoc(), oldValue, request, response, errors); + } + resultMessages.put(name, newValue); + }); + return result; + } + + /** + * This visitor checks if the current schema is fully resolved. + */ + public static final class IsResolvedSchemaVisitor implements SchemaVisitor { + boolean hasUnresolvedParts; + + IsResolvedSchemaVisitor() { + hasUnresolvedParts = false; + } + + @Override + public SchemaVisitorAction visitTerminal(Schema terminal) { + hasUnresolvedParts = isUnresolvedSchema(terminal); + return hasUnresolvedParts ? SchemaVisitorAction.TERMINATE : SchemaVisitorAction.CONTINUE; + } + + @Override + public SchemaVisitorAction visitNonTerminal(Schema nonTerminal) { + hasUnresolvedParts = isUnresolvedSchema(nonTerminal); + if (hasUnresolvedParts) { + return SchemaVisitorAction.TERMINATE; + } + if (nonTerminal.getType() == Schema.Type.RECORD && !nonTerminal.hasFields()) { + // We're still initializing the type... + return SchemaVisitorAction.SKIP_SUBTREE; + } + return SchemaVisitorAction.CONTINUE; + } + + @Override + public SchemaVisitorAction afterVisitNonTerminal(Schema nonTerminal) { + return SchemaVisitorAction.CONTINUE; + } + + @Override + public Boolean get() { + return !hasUnresolvedParts; + } + } + + /** + * This visitor creates clone of the visited Schemata, minus the specified + * schema properties, and resolves all unresolved schemas. + */ + public static final class ResolvingVisitor implements SchemaVisitor { + private static final Set CONTAINER_SCHEMA_TYPES = EnumSet.of(RECORD, ARRAY, MAP, UNION); + private static final Set NAMED_SCHEMA_TYPES = EnumSet.of(RECORD, ENUM, FIXED); + + private final Function symbolTable; + private final Set schemaPropertiesToRemove; + private final IdentityHashMap replace; + + private final Schema root; + + public ResolvingVisitor(final Schema root, final Function symbolTable, + String... schemaPropertiesToRemove) { + this(root, symbolTable, new HashSet<>(Arrays.asList(schemaPropertiesToRemove))); + } + + public ResolvingVisitor(final Schema root, final Function symbolTable, + Set schemaPropertiesToRemove) { + this.replace = new IdentityHashMap<>(); + this.symbolTable = symbolTable; + this.schemaPropertiesToRemove = schemaPropertiesToRemove; + + this.root = root; + } + + public ResolvingVisitor withRoot(Schema root) { + return new ResolvingVisitor(root, symbolTable, schemaPropertiesToRemove); + } + + @Override + public SchemaVisitorAction visitTerminal(final Schema terminal) { + Schema.Type type = terminal.getType(); + Schema newSchema; + if (CONTAINER_SCHEMA_TYPES.contains(type)) { + if (!replace.containsKey(terminal)) { + throw new IllegalStateException("Schema " + terminal + " must be already processed"); + } + return SchemaVisitorAction.CONTINUE; + } else if (type == ENUM) { + newSchema = Schema.createEnum(terminal.getName(), terminal.getDoc(), terminal.getNamespace(), + terminal.getEnumSymbols(), terminal.getEnumDefault()); + } else if (type == FIXED) { + newSchema = Schema.createFixed(terminal.getName(), terminal.getDoc(), terminal.getNamespace(), + terminal.getFixedSize()); + } else { + newSchema = Schema.create(type); + } + copyProperties(terminal, newSchema); + replace.put(terminal, newSchema); + return SchemaVisitorAction.CONTINUE; + } + + public void copyProperties(final Schema first, final Schema second) { + // Logical type + Optional.ofNullable(first.getLogicalType()).ifPresent(logicalType -> logicalType.addToSchema(second)); + + // Aliases (if applicable) + if (NAMED_SCHEMA_TYPES.contains(first.getType())) { + first.getAliases().forEach(second::addAlias); + } + + // Other properties + first.getObjectProps().forEach((name, value) -> { + if (!schemaPropertiesToRemove.contains(name)) { + second.addProp(name, value); + } + }); + } + + @Override + public SchemaVisitorAction visitNonTerminal(final Schema nt) { + Schema.Type type = nt.getType(); + if (type == RECORD) { + if (isUnresolvedSchema(nt)) { + // unresolved schema will get a replacement that we already encountered, + // or we will attempt to resolve. + final String unresolvedSchemaName = getUnresolvedSchemaName(nt); + Schema resSchema = symbolTable.apply(unresolvedSchemaName); + if (resSchema == null) { + throw new AvroTypeException("Undefined schema: " + unresolvedSchemaName); + } + Schema replacement = replace.computeIfAbsent(resSchema, schema -> { + Schemas.visit(schema, this); + return replace.get(schema); + }); + replace.put(nt, replacement); + } else { + // Create a clone without fields. Fields will be added in afterVisitNonTerminal. + Schema newSchema = Schema.createRecord(nt.getName(), nt.getDoc(), nt.getNamespace(), nt.isError()); + copyProperties(nt, newSchema); + replace.put(nt, newSchema); + } + } + return SchemaVisitorAction.CONTINUE; + } + + @Override + public SchemaVisitorAction afterVisitNonTerminal(final Schema nt) { + Schema.Type type = nt.getType(); + Schema newSchema; + switch (type) { + case RECORD: + if (!isUnresolvedSchema(nt)) { + newSchema = replace.get(nt); + // Check if we've already handled the replacement schema with a + // reentrant call to visit(...) from within the visitor. + if (!newSchema.hasFields()) { + List fields = nt.getFields(); + List newFields = new ArrayList<>(fields.size()); + for (Schema.Field field : fields) { + newFields.add(new Schema.Field(field, replace.get(field.schema()))); + } + newSchema.setFields(newFields); + } + } + return SchemaVisitorAction.CONTINUE; + case UNION: + List types = nt.getTypes(); + List newTypes = new ArrayList<>(types.size()); + for (Schema sch : types) { + newTypes.add(replace.get(sch)); + } + newSchema = Schema.createUnion(newTypes); + break; + case ARRAY: + newSchema = Schema.createArray(replace.get(nt.getElementType())); + break; + case MAP: + newSchema = Schema.createMap(replace.get(nt.getValueType())); + break; + default: + throw new IllegalStateException("Illegal type " + type + ", schema " + nt); + } + copyProperties(nt, newSchema); + replace.put(nt, newSchema); + return SchemaVisitorAction.CONTINUE; + } + + @Override + public Schema get() { + return replace.get(root); + } + + @Override + public String toString() { + return "ResolvingVisitor{symbolTable=" + symbolTable + ", schemaPropertiesToRemove=" + schemaPropertiesToRemove + + ", replace=" + replace + '}'; + } + } +} diff --git a/lang/java/idl/src/main/java/org/apache/avro/idl/SchemaVisitor.java b/lang/java/avro/src/main/java/org/apache/avro/util/SchemaVisitor.java similarity index 77% rename from lang/java/idl/src/main/java/org/apache/avro/idl/SchemaVisitor.java rename to lang/java/avro/src/main/java/org/apache/avro/util/SchemaVisitor.java index 0f9fcae5b68..1ac35baeda7 100644 --- a/lang/java/idl/src/main/java/org/apache/avro/idl/SchemaVisitor.java +++ b/lang/java/avro/src/main/java/org/apache/avro/util/SchemaVisitor.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.avro.idl; +package org.apache.avro.util; import org.apache.avro.Schema; @@ -44,4 +44,26 @@ public interface SchemaVisitor { * @return a value that will be returned by the visit method. */ T get(); + + enum SchemaVisitorAction { + + /** + * continue visit. + */ + CONTINUE, + /** + * terminate visit. + */ + TERMINATE, + /** + * when returned from pre non terminal visit method the children of the non + * terminal are skipped. afterVisitNonTerminal for the current schema will not + * be invoked. + */ + SKIP_SUBTREE, + /** + * Skip visiting the siblings of this schema. + */ + SKIP_SIBLINGS + } } diff --git a/lang/java/idl/src/main/java/org/apache/avro/idl/Schemas.java b/lang/java/avro/src/main/java/org/apache/avro/util/Schemas.java similarity index 89% rename from lang/java/idl/src/main/java/org/apache/avro/idl/Schemas.java rename to lang/java/avro/src/main/java/org/apache/avro/util/Schemas.java index da4b949d2bc..927a0c37b43 100644 --- a/lang/java/idl/src/main/java/org/apache/avro/idl/Schemas.java +++ b/lang/java/avro/src/main/java/org/apache/avro/util/Schemas.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.avro.idl; +package org.apache.avro.util; import org.apache.avro.Schema; import org.apache.avro.Schema.Field; @@ -51,8 +51,7 @@ public static T visit(final Schema start, final SchemaVisitor visitor) { while ((current = dq.poll()) != null) { if (current instanceof Supplier) { // We are executing a non-terminal post visit. - @SuppressWarnings("unchecked") - SchemaVisitorAction action = ((Supplier) current).get(); + SchemaVisitor.SchemaVisitorAction action = ((Supplier) current).get(); switch (action) { case CONTINUE: break; @@ -107,14 +106,14 @@ public static T visit(final Schema start, final SchemaVisitor visitor) { private static boolean visitNonTerminal(final SchemaVisitor visitor, final Schema schema, final Deque dq, final Iterable itSupp) { - SchemaVisitorAction action = visitor.visitNonTerminal(schema); + SchemaVisitor.SchemaVisitorAction action = visitor.visitNonTerminal(schema); switch (action) { case CONTINUE: - dq.push((Supplier) () -> visitor.afterVisitNonTerminal(schema)); + dq.push((Supplier) () -> visitor.afterVisitNonTerminal(schema)); itSupp.forEach(dq::push); break; case SKIP_SUBTREE: - dq.push((Supplier) () -> visitor.afterVisitNonTerminal(schema)); + dq.push((Supplier) () -> visitor.afterVisitNonTerminal(schema)); break; case SKIP_SIBLINGS: while (dq.peek() instanceof Schema) { @@ -130,7 +129,7 @@ private static boolean visitNonTerminal(final SchemaVisitor visitor, final Sc } private static boolean visitTerminal(final SchemaVisitor visitor, final Schema schema, final Deque dq) { - SchemaVisitorAction action = visitor.visitTerminal(schema); + SchemaVisitor.SchemaVisitorAction action = visitor.visitTerminal(schema); switch (action) { case CONTINUE: break; diff --git a/lang/java/avro/src/main/java/org/apache/avro/util/UtfTextUtils.java b/lang/java/avro/src/main/java/org/apache/avro/util/UtfTextUtils.java new file mode 100644 index 00000000000..967a48bf364 --- /dev/null +++ b/lang/java/avro/src/main/java/org/apache/avro/util/UtfTextUtils.java @@ -0,0 +1,247 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.avro.util; + +import java.io.BufferedInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; + +/** + * Text utilities especially suited for UTF encoded bytes. + * + *

+ * When the character set is unknown, methods in this class assume UTF encoded + * text and try to detect the UTF variant (8/16/32 bits, big/little endian), + * using the BOM (if present) or an educated guess assuming the first character + * is in the range U+0000-U+00FF. This heuristic works for all latin text based + * formats, which includes Avro IDL, JSON, XML, etc. If the heuristic fails, + * UTF-8 is assumed. + *

+ * + * @see XML specification, + * appendix F: Autodetection of Character Encodings (Non-Normative) + */ +public class UtfTextUtils { + private static final int TRANSFER_BUFFER_SIZE = 4096; + /** + * JVM standard character set (but that doesn't have a constant in + * {@link StandardCharsets}) for UTF-32. + */ + private static final Charset UTF_32 = Charset.forName("UTF-32"); + /** + * JVM standard character set (but that doesn't have a constant in + * {@link StandardCharsets}) for UTF-32BE. + */ + private static final Charset UTF_32BE = Charset.forName("UTF-32BE"); + /** + * JVM standard character set (but that doesn't have a constant in + * {@link StandardCharsets}) for UTF-32LE. + */ + private static final Charset UTF_32LE = Charset.forName("UTF-32LE"); + + public static String asString(byte[] bytes, Charset charset) { + if (charset == null) { + charset = detectUtfCharset(bytes); + } + return skipBOM(new String(bytes, charset)); + } + + /** + * Reads the specified input stream as text. If {@code charset} is {@code null}, + * the method will assume UTF encoded text and attempt to detect the appropriate + * charset. + * + * @param input the input to read + * @param charset the character set of the input, if known + * @return all bytes, read into a string + * @throws IOException when reading the input fails for some reason + */ + public static String readAllBytes(InputStream input, Charset charset) throws IOException { + if (charset == null) { + input = ensureMarkSupport(input); + input.mark(4); + byte[] buffer = new byte[4]; + int bytesRead = fillBuffer(input, buffer); + input.reset(); + + charset = detectUtfCharset0(buffer, bytesRead); + + if (charset == null) { + throw new IOException("Unsupported UCS-4 variant (neither UTF-32BE nor UTF32-LE)"); + } + } + Reader reader = new InputStreamReader(input, charset); + return readAllChars(reader); + } + + private static InputStream ensureMarkSupport(InputStream input) { + if (input.markSupported()) { + return input; + } else { + return new BufferedInputStream(input); + } + } + + private static int fillBuffer(InputStream in, byte[] buf) throws IOException { + int remaining = buf.length; + int offset = 0; + while (remaining > 0) { + int bytesRead = in.read(buf, offset, remaining); + // As remaining > 0, bytesRead is either -1 or positive + if (bytesRead == -1) { + break; + } + offset += bytesRead; + remaining -= bytesRead; + } + return offset; + } + + public static String readAllChars(Reader input) throws IOException { + StringBuilder buffer = new StringBuilder(); + char[] charBuffer = new char[TRANSFER_BUFFER_SIZE]; + int charsRead; + while ((charsRead = input.read(charBuffer, 0, TRANSFER_BUFFER_SIZE)) >= 0) { + buffer.append(charBuffer, 0, charsRead); + } + return skipBOM(buffer); + } + + private static String skipBOM(CharSequence buffer) { + if (buffer.charAt(0) == '\uFEFF') { + return buffer.subSequence(1, buffer.length()).toString(); + } + return buffer.toString(); + } + + /** + * Assuming UTF encoded bytes, detect the UTF variant (8/16/32 bits, big/little + * endian). + * + *

+ * To ensure the most accurate detection, the algorithm requires at least 4 + * bytes. One should only provide less than 4 bytes of data if that is all there + * is. + *

+ * + *

+ * Detection is certain when a byte order mark (BOM) is used. Otherwise a + * heuristic is used, which works when the first character is from the first 256 + * characters from the BMP (U+0000-U+00FF). This works for all latin-based + * textual formats, like Avro IDL, JSON, YAML, XML, etc. + *

+ * + * @param firstFewBytes the first few bytes of the text to detect the character + * set of + * @return the character set to use + */ + public static Charset detectUtfCharset(byte[] firstFewBytes) { + Charset detectedCharset = detectUtfCharset0(firstFewBytes, firstFewBytes.length); + if (detectedCharset == null) { + throw new IllegalArgumentException("Unsupported UCS-4 variant (neither UTF-32BE nor UTF32-LE)"); + } + return detectedCharset; + } + + private static Charset detectUtfCharset0(byte[] firstFewBytes, int numBytes) { + // spotless:off + /* + * Lookup table, adapted from https://www.w3.org/TR/xml/#sec-guessing + * It omits non-UTF encodings (the 2nd and 3rd rows from the end). + * Note that the order (with respect to UTF-32 & UTF-16) is important! + * + * (the non-zero bytes encode the byte order mark, BOM) + * + * Match the 'magic bytes' in order, and take the first match: + * 00 00 FE FF -> UTF-32 (be) + * FF FE 00 00 -> UTF-32 (le) + * 00 00 FF FE -> unsupported UCS-4 (byte order 2143) + * FE FF 00 00 -> unsupported UCS-4 (byte order 3412) + * FE FF __ __ -> UTF-16 (be) + * FF FE __ __ -> UTF-16 (le) + * EF BB BF __ -> UTF-8 + * 00 00 00 __ -> UTF-32BE + * __ 00 00 00 -> UTF-32LE + * 00 00 __ 00 -> unsupported UCS-4 (byte order 2143) + * 00 __ 00 00 -> unsupported UCS-4 (byte order 3412) + * 00 __ __ __ -> UTF-16BE + * __ 00 __ __ -> UTF-16LE + * __ __ __ __ -> UTF-8 (fallback) + */ + // spotless:on + int quad = quad(firstFewBytes, numBytes); + int word = quad >>> 16; + if (numBytes > 3 && (quad == 0x0000FEFF || quad == 0xFFFE0000)) { + // With BOM: UTF-32 (Charset handles BOM & endianness) + return UTF_32; + } else if (numBytes > 3 && (quad == 0x0000FFFE || quad == 0xFEFF0000)) { + // With BOM: unsupported UCS-4 encoding (byte order 2143 resp. 3412) + return null; + } else if (numBytes > 1 && (word == 0xFEFF || word == 0xFFFE)) { + // With BOM: UTF-16 (Charset handles BOM & endianness) + return StandardCharsets.UTF_16; + } else if (numBytes > 2 && quad >>> 8 == 0xEFBBBF) { + // With BOM: UTF-8 (Charset does not handle a BOM, so our caller must skip it) + return StandardCharsets.UTF_8; + } else if (numBytes > 3 && (quad & 0xFFFFFF00) == 0) { + // Without BOM (i.e., a guess) + return UTF_32BE; + } else if (numBytes > 3 && (quad & 0x00FFFFFF) == 0) { + // Without BOM (i.e., a guess) + return UTF_32LE; + } else if (numBytes > 3 && (quad & 0xFFFF00FF) == 0 || (quad & 0xFF00FFFF) == 0) { + // Without BOM (i.e., a guess): unsupported UCS-4 encoding (byte order 2143 + // resp. 3412) + return null; + } else if (numBytes > 1 && (word & 0xFF00) == 0) { + // Without BOM (i.e., a guess) + return StandardCharsets.UTF_16BE; + } else if (numBytes > 1 && (word & 0x00FF) == 0) { + // Without BOM (i.e., a guess) + return StandardCharsets.UTF_16LE; + } else { + // Fallback + return StandardCharsets.UTF_8; + } + } + + private static int quad(byte[] bytes, int length) { + int quad = 0xFFFFFFFF; + switch (length) { + default: + quad = (quad & 0xFFFFFF00) | (bytes[3] & 0xFF); + // Fallthrough + case 3: + quad = (quad & 0xFFFF00FF) | (bytes[2] & 0xFF) << 8; + // Fallthrough + case 2: + quad = (quad & 0xFF00FFFF) | (bytes[1] & 0xFF) << 16; + // Fallthrough + case 1: + quad = (quad & 0x00FFFFFF) | (bytes[0] & 0xFF) << 24; + // Fallthrough + case 0: + break; + } + return quad; + } +} diff --git a/lang/java/avro/src/test/java/org/apache/avro/DummySchemaParser.java b/lang/java/avro/src/test/java/org/apache/avro/DummySchemaParser.java new file mode 100644 index 00000000000..4802aea0747 --- /dev/null +++ b/lang/java/avro/src/test/java/org/apache/avro/DummySchemaParser.java @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.avro; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.net.URI; + +public class DummySchemaParser implements FormattedSchemaParser { + /** + * Logger for this class. + */ + private static final Logger LOGGER = LoggerFactory.getLogger(DummySchemaParser.class); + public static final String SCHEMA_TEXT_ONE = "one"; + public static final Schema FIXED_SCHEMA = Schema.createFixed("DummyOne", null, "tests", 42); + public static final String SCHEMA_TEXT_ERROR = "error"; + public static final String SCHEMA_TEXT_IO_ERROR = "io-error"; + public static final String ERROR_MESSAGE = "Syntax error"; + public static final String IO_ERROR_MESSAGE = "I/O error"; + + @Override + public Schema parse(ParseContext parseContext, URI baseUri, CharSequence formattedSchema) + throws IOException, SchemaParseException { + LOGGER.info("Using DummySchemaParser for {}", formattedSchema); + if (SCHEMA_TEXT_ONE.contentEquals(formattedSchema)) { + parseContext.put(FIXED_SCHEMA); + return FIXED_SCHEMA; + } else if (SCHEMA_TEXT_ERROR.contentEquals(formattedSchema)) { + throw new SchemaParseException(ERROR_MESSAGE); + } else if (SCHEMA_TEXT_IO_ERROR.contentEquals(formattedSchema)) { + throw new IOException(IO_ERROR_MESSAGE); + } + // Syntax not recognized + return null; + } +} diff --git a/lang/java/avro/src/test/java/org/apache/avro/ParseContextTest.java b/lang/java/avro/src/test/java/org/apache/avro/ParseContextTest.java new file mode 100644 index 00000000000..afd3a643570 --- /dev/null +++ b/lang/java/avro/src/test/java/org/apache/avro/ParseContextTest.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.avro; + +import org.apache.avro.util.SchemaResolver; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.util.EnumSet; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static org.junit.jupiter.api.Assertions.assertNotSame; +import static org.junit.jupiter.api.Assertions.assertSame; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class ParseContextTest { + Schema fooRecord, fooRecordCopy, barEnum, bazFixed, mehRecord; + ParseContext fooBarBaz; + + @BeforeEach + public void setUp() throws Exception { + fooRecord = SchemaBuilder.record("ns.Foo").fields().endRecord(); + fooRecordCopy = SchemaBuilder.record("ns.Foo").fields().endRecord(); + barEnum = SchemaBuilder.enumeration("ns.Bar").symbols(); + bazFixed = SchemaBuilder.fixed("ns.Baz").size(8); + mehRecord = SchemaBuilder.record("ns.Meh").fields().endRecord(); + + fooBarBaz = new ParseContext(); + fooBarBaz.put(fooRecord); + fooBarBaz.put(barEnum); + fooBarBaz.put(bazFixed); + } + + @Test + public void checkNewNameContextContainsPrimitives() { + EnumSet complexTypes = EnumSet.of(Schema.Type.RECORD, Schema.Type.ENUM, Schema.Type.FIXED, + Schema.Type.UNION, Schema.Type.ARRAY, Schema.Type.MAP); + EnumSet primitives = EnumSet.complementOf(complexTypes); + + ParseContext context = new ParseContext(); + for (Schema.Type type : complexTypes) { + assertFalse(context.contains(type.getName())); + } + for (Schema.Type type : primitives) { + assertTrue(context.contains(type.getName())); + } + } + + @Test + public void primitivesAreNotCached() { + EnumSet primitives = EnumSet.complementOf(EnumSet.of(Schema.Type.RECORD, Schema.Type.ENUM, + Schema.Type.FIXED, Schema.Type.UNION, Schema.Type.ARRAY, Schema.Type.MAP)); + + ParseContext context = new ParseContext(); + for (Schema.Type type : primitives) { + Schema first = context.resolve(type.getName()); + Schema second = context.resolve(type.getName()); + assertEquals(first, second); + assertNotSame(first, second); + + first.addProp("logicalType", "brick"); + assertNotEquals(first, second); + } + } + + @Test + public void validateSchemaRetrievalFailure() { + Schema unknown = Schema.createFixed("unknown", null, null, 0); + + Schema unresolved = fooBarBaz.resolve("unknown"); + assertTrue(SchemaResolver.isUnresolvedSchema(unresolved)); + assertEquals(unknown.getFullName(), SchemaResolver.getUnresolvedSchemaName(unresolved)); + } + + @Test + public void validateSchemaRetrievalByFullName() { + assertSame(fooRecord, fooBarBaz.resolve(fooRecord.getFullName())); + } + + @Test + public void verifyPutIsIdempotent() { + ParseContext context = new ParseContext(); + assertNotEquals(fooRecord, context.resolve(fooRecord.getFullName())); + + context.put(fooRecord); + assertEquals(fooRecord, context.resolve(fooRecord.getFullName())); + + context.put(fooRecord); + assertEquals(fooRecord, context.resolve(fooRecord.getFullName())); + } + + @Test + public void verifyPutOnlyAcceptsNamedSchemas() { + ParseContext context = new ParseContext(); + assertThrows(AvroRuntimeException.class, () -> context.put(Schema.create(Schema.Type.STRING))); + } + + @Test + public void verifyAddDoesNotAllowChangingSchemas() { + Schema fooEnum = SchemaBuilder.enumeration("ns.Foo").symbols(); + + ParseContext context = new ParseContext(); + context.put(fooRecord); + assertThrows(AvroRuntimeException.class, () -> context.put(fooEnum)); + } +} diff --git a/lang/java/avro/src/test/java/org/apache/avro/SchemaNameValidatorTest.java b/lang/java/avro/src/test/java/org/apache/avro/SchemaNameValidatorTest.java index 6846c4434cf..871c172875d 100644 --- a/lang/java/avro/src/test/java/org/apache/avro/SchemaNameValidatorTest.java +++ b/lang/java/avro/src/test/java/org/apache/avro/SchemaNameValidatorTest.java @@ -28,32 +28,32 @@ class SchemaNameValidatorTest { @ParameterizedTest @MethodSource("data") - void validator(Schema.NameValidator validator, String input, boolean expectedResult) { - Schema.NameValidator.Result result = validator.validate(input); + void validator(NameValidator validator, String input, boolean expectedResult) { + NameValidator.Result result = validator.validate(input); Assertions.assertEquals(expectedResult, result.isOK(), result.getErrors()); } static Stream data() { - return Stream.of(Arguments.of(Schema.NameValidator.UTF_VALIDATOR, null, false), // null not accepted - Arguments.of(Schema.NameValidator.STRICT_VALIDATOR, null, false), // null not accepted - Arguments.of(Schema.NameValidator.UTF_VALIDATOR, "", false), // empty not accepted - Arguments.of(Schema.NameValidator.STRICT_VALIDATOR, "", false), // empty not accepted - Arguments.of(Schema.NameValidator.UTF_VALIDATOR, "Hello world", false), // space not accepted - Arguments.of(Schema.NameValidator.STRICT_VALIDATOR, "Hello world", false), // space not accepted - Arguments.of(Schema.NameValidator.UTF_VALIDATOR, "H&", false), // non letter or digit not accepted - Arguments.of(Schema.NameValidator.STRICT_VALIDATOR, "H&", false), // non letter or digit not accepted - Arguments.of(Schema.NameValidator.UTF_VALIDATOR, "H=", false), // non letter or digit not accepted - Arguments.of(Schema.NameValidator.STRICT_VALIDATOR, "H=", false), // non letter or digit not accepted - Arguments.of(Schema.NameValidator.UTF_VALIDATOR, "H]", false), // non letter or digit not accepted - Arguments.of(Schema.NameValidator.STRICT_VALIDATOR, "H]", false), // non letter or digit not accepted - Arguments.of(Schema.NameValidator.UTF_VALIDATOR, "Hello_world", true), - Arguments.of(Schema.NameValidator.STRICT_VALIDATOR, "Hello_world", true), - Arguments.of(Schema.NameValidator.UTF_VALIDATOR, "éàçô", true), // Accept accent - Arguments.of(Schema.NameValidator.STRICT_VALIDATOR, "éàçô", false), // Not Accept accent - Arguments.of(Schema.NameValidator.UTF_VALIDATOR, "5éàçô", false), // can't start with number - Arguments.of(Schema.NameValidator.STRICT_VALIDATOR, "5éàçô", false), // can't start with number - Arguments.of(Schema.NameValidator.UTF_VALIDATOR, "_Hello_world", true), - Arguments.of(Schema.NameValidator.STRICT_VALIDATOR, "_Hello_world", true)); + return Stream.of(Arguments.of(NameValidator.UTF_VALIDATOR, null, false), // null not accepted + Arguments.of(NameValidator.STRICT_VALIDATOR, null, false), // null not accepted + Arguments.of(NameValidator.UTF_VALIDATOR, "", false), // empty not accepted + Arguments.of(NameValidator.STRICT_VALIDATOR, "", false), // empty not accepted + Arguments.of(NameValidator.UTF_VALIDATOR, "Hello world", false), // space not accepted + Arguments.of(NameValidator.STRICT_VALIDATOR, "Hello world", false), // space not accepted + Arguments.of(NameValidator.UTF_VALIDATOR, "H&", false), // non letter or digit not accepted + Arguments.of(NameValidator.STRICT_VALIDATOR, "H&", false), // non letter or digit not accepted + Arguments.of(NameValidator.UTF_VALIDATOR, "H=", false), // non letter or digit not accepted + Arguments.of(NameValidator.STRICT_VALIDATOR, "H=", false), // non letter or digit not accepted + Arguments.of(NameValidator.UTF_VALIDATOR, "H]", false), // non letter or digit not accepted + Arguments.of(NameValidator.STRICT_VALIDATOR, "H]", false), // non letter or digit not accepted + Arguments.of(NameValidator.UTF_VALIDATOR, "Hello_world", true), + Arguments.of(NameValidator.STRICT_VALIDATOR, "Hello_world", true), + Arguments.of(NameValidator.UTF_VALIDATOR, "éàçô", true), // Accept accent + Arguments.of(NameValidator.STRICT_VALIDATOR, "éàçô", false), // Not Accept accent + Arguments.of(NameValidator.UTF_VALIDATOR, "5éàçô", false), // can't start with number + Arguments.of(NameValidator.STRICT_VALIDATOR, "5éàçô", false), // can't start with number + Arguments.of(NameValidator.UTF_VALIDATOR, "_Hello_world", true), + Arguments.of(NameValidator.STRICT_VALIDATOR, "_Hello_world", true)); } } diff --git a/lang/java/avro/src/test/java/org/apache/avro/TestDataFileReader.java b/lang/java/avro/src/test/java/org/apache/avro/TestDataFileReader.java index 4b4f1c2d35a..6ed6b35cc50 100644 --- a/lang/java/avro/src/test/java/org/apache/avro/TestDataFileReader.java +++ b/lang/java/avro/src/test/java/org/apache/avro/TestDataFileReader.java @@ -90,7 +90,7 @@ void throttledInputStream() throws IOException { // magic header check. This happens with throttled input stream, // where we read into buffer less bytes than requested. - Schema legacySchema = new Schema.Parser(Schema.NameValidator.NO_VALIDATION).setValidateDefaults(false) + Schema legacySchema = new Schema.Parser(NameValidator.NO_VALIDATION).setValidateDefaults(false) .parse("{\"type\": \"record\", \"name\": \"TestSchema\", \"fields\": " + "[ {\"name\": \"id\", \"type\": [\"long\", \"null\"], \"default\": null}]}"); File f = dataDir.resolve("testThrottledInputStream.avro").toFile(); @@ -149,7 +149,7 @@ void inputStreamEOF() throws IOException { // AVRO-2944 describes hanging/failure in reading Avro file with performing // magic header check. This potentially happens with a defective input stream // where a -1 value is unexpectedly returned from a read. - Schema legacySchema = new Schema.Parser(Schema.NameValidator.NO_VALIDATION).setValidateDefaults(false) + Schema legacySchema = new Schema.Parser(NameValidator.NO_VALIDATION).setValidateDefaults(false) .parse("{\"type\": \"record\", \"name\": \"TestSchema\", \"fields\": " + "[ {\"name\": \"id\", \"type\": [\"long\", \"null\"], \"default\": null}]}"); File f = dataDir.resolve("testInputStreamEOF.avro").toFile(); @@ -198,7 +198,7 @@ void ignoreSchemaValidationOnRead() throws IOException { // This schema has an accent in the name and the default for the field doesn't // match the first type in the union. A Java SDK in the past could create a file // containing this schema. - Schema legacySchema = new Schema.Parser(Schema.NameValidator.NO_VALIDATION).setValidateDefaults(false) + Schema legacySchema = new Schema.Parser(NameValidator.NO_VALIDATION).setValidateDefaults(false) .parse("{\"type\": \"record\", \"name\": \"InvalidAccëntWithInvalidNull\", \"fields\": " + "[ {\"name\": \"id\", \"type\": [\"long\", \"null\"], \"default\": null}]}"); diff --git a/lang/java/avro/src/test/java/org/apache/avro/TestSchema.java b/lang/java/avro/src/test/java/org/apache/avro/TestSchema.java index 64748da1364..805f2b80b2a 100644 --- a/lang/java/avro/src/test/java/org/apache/avro/TestSchema.java +++ b/lang/java/avro/src/test/java/org/apache/avro/TestSchema.java @@ -553,14 +553,14 @@ public void parseAliases() throws JsonProcessingException { @Test void testContentAfterAvsc() { - Schema.Parser parser = new Schema.Parser(Schema.NameValidator.UTF_VALIDATOR); + Schema.Parser parser = new Schema.Parser(NameValidator.UTF_VALIDATOR); parser.setValidateDefaults(true); assertThrows(SchemaParseException.class, () -> parser.parse("{\"type\": \"string\"}; DROP TABLE STUDENTS")); } @Test void testContentAfterAvscInInputStream() throws Exception { - Schema.Parser parser = new Schema.Parser(Schema.NameValidator.UTF_VALIDATOR); + Schema.Parser parser = new Schema.Parser(NameValidator.UTF_VALIDATOR); parser.setValidateDefaults(true); String avsc = "{\"type\": \"string\"}; DROP TABLE STUDENTS"; ByteArrayInputStream is = new ByteArrayInputStream(avsc.getBytes(StandardCharsets.UTF_8)); @@ -576,7 +576,7 @@ void testContentAfterAvscInFile() throws Exception { writer.flush(); } - Schema.Parser parser = new Schema.Parser(Schema.NameValidator.UTF_VALIDATOR); + Schema.Parser parser = new Schema.Parser(NameValidator.UTF_VALIDATOR); parser.setValidateDefaults(true); assertThrows(SchemaParseException.class, () -> parser.parse(avscFile)); } diff --git a/lang/java/avro/src/test/java/org/apache/avro/TestSchemaBuilder.java b/lang/java/avro/src/test/java/org/apache/avro/TestSchemaBuilder.java index 293554e8e4b..fdb18d0f87c 100644 --- a/lang/java/avro/src/test/java/org/apache/avro/TestSchemaBuilder.java +++ b/lang/java/avro/src/test/java/org/apache/avro/TestSchemaBuilder.java @@ -882,7 +882,7 @@ void namespaceDefaulting() { void namesAcceptAll() throws InterruptedException { // Ensure that Schema.setNameValidator won't interfere with others unit tests. Runnable r = () -> { - Schema.setNameValidator(Schema.NameValidator.NO_VALIDATION); + Schema.setNameValidator(NameValidator.NO_VALIDATION); final Schema schema = SchemaBuilder.record("7name").fields().name("123").type(Schema.create(Schema.Type.INT)) .noDefault().endRecord(); Assertions.assertNotNull(schema); diff --git a/lang/java/avro/src/test/java/org/apache/avro/TestSchemaParser.java b/lang/java/avro/src/test/java/org/apache/avro/TestSchemaParser.java new file mode 100644 index 00000000000..dc0c77431fe --- /dev/null +++ b/lang/java/avro/src/test/java/org/apache/avro/TestSchemaParser.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.avro; + +import com.fasterxml.jackson.core.JsonParseException; +import org.junit.jupiter.api.Test; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.StringReader; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; + +import static java.util.Collections.singletonList; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +class TestSchemaParser { + private static final Schema SCHEMA_REAL = Schema.createFixed("Real", null, "tests", 42); + private static final String SCHEMA_JSON = SCHEMA_REAL.toString(false); + + @Test + void testParseFile() throws IOException { + Path tempFile = Files.createTempFile("TestSchemaParser", null); + Files.write(tempFile, singletonList(SCHEMA_JSON)); + + Schema schema = new SchemaParser().parse(tempFile.toFile()); + assertEquals(SCHEMA_REAL, schema); + } + + @Test + void testParsePath() throws IOException { + Path tempFile = Files.createTempFile("TestSchemaParser", null); + Files.write(tempFile, singletonList(SCHEMA_JSON)); + + Schema schema = new SchemaParser().parse(tempFile); + assertEquals(SCHEMA_REAL, schema); + } + + @Test + void testParseReader() throws IOException { + Schema schema = new SchemaParser().parse(new StringReader(SCHEMA_JSON)); + assertEquals(SCHEMA_REAL, schema); + } + + @Test + void testParseStream() throws IOException { + Schema schema = new SchemaParser().parse(new ByteArrayInputStream(SCHEMA_JSON.getBytes(StandardCharsets.UTF_16))); + assertEquals(SCHEMA_REAL, schema); + } + + @Test + void testParseTextWithFallbackJsonParser() { + Schema schema = new SchemaParser().parse(SCHEMA_JSON); + assertEquals(SCHEMA_REAL, schema); + } + + @Test + void testParseByCustomParser() { + Schema schema = new SchemaParser().parse(DummySchemaParser.SCHEMA_TEXT_ONE); + assertEquals(DummySchemaParser.FIXED_SCHEMA, schema); + } + + @Test + void testSingleParseError() { + SchemaParseException parseException = assertThrows(SchemaParseException.class, + () -> new SchemaParser().parse("foo")); + assertEquals(JsonParseException.class, parseException.getCause().getClass()); + assertEquals(0, parseException.getSuppressed().length); + } + + @Test + void testMultipleParseErrors() { + SchemaParseException parseException = assertThrows(SchemaParseException.class, + () -> new SchemaParser().parse(DummySchemaParser.SCHEMA_TEXT_ERROR)); + assertTrue(parseException.getMessage().startsWith("Could not parse the schema")); + Throwable[] suppressed = parseException.getSuppressed(); + assertEquals(2, suppressed.length); + assertEquals(DummySchemaParser.ERROR_MESSAGE, suppressed[0].getMessage()); + assertEquals(JsonParseException.class, suppressed[1].getCause().getClass()); + } + + @Test + void testIOFailureWhileParsingText() { + AvroRuntimeException exception = assertThrows(AvroRuntimeException.class, + () -> new SchemaParser().parse(DummySchemaParser.SCHEMA_TEXT_IO_ERROR)); + assertEquals(IOException.class, exception.getCause().getClass()); + assertEquals(DummySchemaParser.IO_ERROR_MESSAGE, exception.getCause().getMessage()); + } +} diff --git a/lang/java/avro/src/test/java/org/apache/avro/reflect/TestReflect.java b/lang/java/avro/src/test/java/org/apache/avro/reflect/TestReflect.java index 5f52a2cf789..2a75916169d 100644 --- a/lang/java/avro/src/test/java/org/apache/avro/reflect/TestReflect.java +++ b/lang/java/avro/src/test/java/org/apache/avro/reflect/TestReflect.java @@ -38,6 +38,7 @@ import org.apache.avro.AvroRuntimeException; import org.apache.avro.AvroTypeException; import org.apache.avro.JsonProperties; +import org.apache.avro.NameValidator; import org.apache.avro.Protocol; import org.apache.avro.Schema; import org.apache.avro.Schema.Field; @@ -1260,7 +1261,7 @@ private static class Z { @Test void dollarTerminatedNamespaceCompatibility() { ReflectData data = ReflectData.get(); - Schema s = new Schema.Parser(Schema.NameValidator.NO_VALIDATION).parse( + Schema s = new Schema.Parser(NameValidator.NO_VALIDATION).parse( "{\"type\":\"record\",\"name\":\"Z\",\"namespace\":\"org.apache.avro.reflect.TestReflect$\",\"fields\":[]}"); assertEquals(data.getSchema(data.getClass(s)).toString(), "{\"type\":\"record\",\"name\":\"Z\",\"namespace\":\"org.apache.avro.reflect.TestReflect\",\"fields\":[]}"); @@ -1270,7 +1271,7 @@ void dollarTerminatedNamespaceCompatibility() { void dollarTerminatedNestedStaticClassNamespaceCompatibility() { ReflectData data = ReflectData.get(); // Older versions of Avro generated this namespace on nested records. - Schema s = new Schema.Parser(Schema.NameValidator.NO_VALIDATION).parse( + Schema s = new Schema.Parser(NameValidator.NO_VALIDATION).parse( "{\"type\":\"record\",\"name\":\"AnotherSampleRecord\",\"namespace\":\"org.apache.avro.reflect.TestReflect$SampleRecord\",\"fields\":[]}"); assertThat(data.getSchema(data.getClass(s)).getFullName(), is("org.apache.avro.reflect.TestReflect.SampleRecord.AnotherSampleRecord")); diff --git a/lang/java/idl/src/test/java/org/apache/avro/idl/TestSchemaResolver.java b/lang/java/avro/src/test/java/org/apache/avro/util/TestSchemaResolver.java similarity index 62% rename from lang/java/idl/src/test/java/org/apache/avro/idl/TestSchemaResolver.java rename to lang/java/avro/src/test/java/org/apache/avro/util/TestSchemaResolver.java index 70488232581..1a340b7fa5d 100644 --- a/lang/java/idl/src/test/java/org/apache/avro/idl/TestSchemaResolver.java +++ b/lang/java/avro/src/test/java/org/apache/avro/util/TestSchemaResolver.java @@ -1,3 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + /* * Copyright 2017 The Apache Software Foundation. * @@ -13,32 +31,25 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.avro.idl; +package org.apache.avro.util; -import org.apache.avro.Protocol; import org.apache.avro.Schema; import org.apache.avro.SchemaBuilder; -import org.junit.Assert; import org.junit.Test; import java.io.IOException; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.io.File; -import java.io.IOException; -import java.nio.file.Path; -import java.nio.file.Paths; public class TestSchemaResolver { @Test public void testResolving() throws IOException { - Path testIdl = Paths.get(".", "src", "test", "idl", "cycle.avdl").toAbsolutePath(); - IdlReader parser = new IdlReader(); - IdlFile idlFile = parser.parse(testIdl); - Protocol protocol = idlFile.getProtocol(); - System.out.println(protocol); - Assert.assertEquals(5, protocol.getTypes().size()); + // Path testIdl = Paths.get(".", "src", "test", "idl", + // "cycle.avdl").toAbsolutePath(); + // IdlReader parser = new IdlReader(); + // IdlFile idlFile = parser.parse(testIdl); + // Protocol protocol = idlFile.getProtocol(); + // System.out.println(protocol); + // Assert.assertEquals(5, protocol.getTypes().size()); } @Test(expected = IllegalArgumentException.class) diff --git a/lang/java/idl/src/test/java/org/apache/avro/idl/TestSchemas.java b/lang/java/avro/src/test/java/org/apache/avro/util/TestSchemas.java similarity index 90% rename from lang/java/idl/src/test/java/org/apache/avro/idl/TestSchemas.java rename to lang/java/avro/src/test/java/org/apache/avro/util/TestSchemas.java index 000ba20dbcb..6609819299c 100644 --- a/lang/java/idl/src/test/java/org/apache/avro/idl/TestSchemas.java +++ b/lang/java/avro/src/test/java/org/apache/avro/util/TestSchemas.java @@ -1,3 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + /* * Copyright 2017 The Apache Software Foundation. * @@ -13,7 +31,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.avro.idl; +package org.apache.avro.util; import org.apache.avro.Schema; import org.junit.Assert; diff --git a/lang/java/avro/src/test/java/org/apache/avro/util/UtfTextUtilsTest.java b/lang/java/avro/src/test/java/org/apache/avro/util/UtfTextUtilsTest.java new file mode 100644 index 00000000000..6c525e6d39a --- /dev/null +++ b/lang/java/avro/src/test/java/org/apache/avro/util/UtfTextUtilsTest.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.avro.util; + +import org.junit.jupiter.api.Test; + +import java.io.ByteArrayInputStream; +import java.io.FilterInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +@SuppressWarnings("SpellCheckingInspection") +class UtfTextUtilsTest { + @Test + void validateCharsetDetectionWithBOM() { + assertEquals("UTF-32", testDetection("0000FEFF").name()); + assertEquals("UTF-32", testDetection("FFFE0000").name()); + assertEquals("UTF-16", testDetection("FEFF0041").name()); + assertEquals("UTF-16", testDetection("FFFE4100").name()); + assertEquals("UTF-8", testDetection("EFBBBF41").name()); + + // Invalid UCS-4 encodings: these we're certain we cannot handle. + assertThrows(IllegalArgumentException.class, () -> testDetection("0000FFFE")); + assertThrows(IllegalArgumentException.class, () -> testDetection("FEFF0000")); + } + + @Test + void validateCharsetDetectionWithoutBOM() { + assertEquals("UTF-32BE", testDetection("00000041").name()); + assertEquals("UTF-32LE", testDetection("41000000").name()); + assertEquals("UTF-16BE", testDetection("00410042").name()); + assertEquals("UTF-16LE", testDetection("41004200").name()); + assertEquals("UTF-8", testDetection("41424344").name()); + + assertEquals("UTF-8", testDetection("414243").name()); + + assertEquals("UTF-16BE", testDetection("0041").name()); + assertEquals("UTF-16LE", testDetection("4100").name()); + assertEquals("UTF-8", testDetection("4142").name()); + + assertEquals("UTF-8", testDetection("41").name()); + + assertEquals("UTF-8", testDetection("").name()); + + // Invalid UCS-4 encodings: these we're fairly certain we cannot handle. + assertThrows(IllegalArgumentException.class, () -> testDetection("00004100")); + assertThrows(IllegalArgumentException.class, () -> testDetection("00410000")); + } + + private Charset testDetection(String hexBytes) { + return UtfTextUtils.detectUtfCharset(hexBytes(hexBytes)); + } + + private static byte[] hexBytes(String hexBytes) { + byte[] bytes = new byte[hexBytes.length() / 2]; + for (int i = 0; i < bytes.length; i++) { + int index = i * 2; + bytes[i] = (byte) Integer.parseUnsignedInt(hexBytes.substring(index, index + 2), 16); + } + return bytes; + } + + @Test + void validateTextConversionFromBytes() { + assertEquals("A", UtfTextUtils.asString(hexBytes("EFBBBF41"), StandardCharsets.UTF_8)); + assertEquals("A", UtfTextUtils.asString(hexBytes("EFBBBF41"), null)); + + assertEquals("A", UtfTextUtils.asString(hexBytes("41"), StandardCharsets.UTF_8)); + assertEquals("A", UtfTextUtils.asString(hexBytes("41"), null)); + } + + @Test + void validateTextConversionFromStreams() throws IOException { + assertEquals("A", + UtfTextUtils.readAllBytes(new ByteArrayInputStream(hexBytes("EFBBBF41")), StandardCharsets.UTF_8)); + assertEquals("A", UtfTextUtils.readAllBytes(new ByteArrayInputStream(hexBytes("EFBBBF41")), null)); + + assertEquals("A", UtfTextUtils.readAllBytes(new ByteArrayInputStream(hexBytes("41")), StandardCharsets.UTF_8)); + assertEquals("A", UtfTextUtils.readAllBytes(new ByteArrayInputStream(hexBytes("41")), null)); + + // Invalid UCS-4 encoding should throw an IOException instead of an + // IllegalArgumentException. + assertThrows(IOException.class, + () -> UtfTextUtils.readAllBytes(new ByteArrayInputStream(hexBytes("0000FFFE")), null)); + } + + @Test + void validateSupportForUnmarkableStreams() throws IOException { + assertEquals("ABCD", + UtfTextUtils.readAllBytes(new UnmarkableInputStream(new ByteArrayInputStream(hexBytes("41424344"))), null)); + } + + private static class UnmarkableInputStream extends FilterInputStream { + public UnmarkableInputStream(InputStream input) { + super(input); + } + + @Override + public synchronized void mark(int ignored) { + } + + @Override + public synchronized void reset() throws IOException { + throw new IOException("mark/reset not supported"); + } + + @Override + public boolean markSupported() { + return false; + } + } +} diff --git a/lang/java/avro/src/test/resources/META-INF/services/org.apache.avro.FormattedSchemaParser b/lang/java/avro/src/test/resources/META-INF/services/org.apache.avro.FormattedSchemaParser new file mode 100644 index 00000000000..b2db6ddb269 --- /dev/null +++ b/lang/java/avro/src/test/resources/META-INF/services/org.apache.avro.FormattedSchemaParser @@ -0,0 +1,18 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +org.apache.avro.DummySchemaParser diff --git a/lang/java/build.sh b/lang/java/build.sh index 96fdb3489cb..976f67dab4e 100755 --- a/lang/java/build.sh +++ b/lang/java/build.sh @@ -16,6 +16,7 @@ # limitations under the License. set -e +set -x usage() { echo "Usage: $0 {lint|test|dist|clean}" @@ -33,7 +34,7 @@ main() { test) mvn -B test # Test the modules that depend on hadoop using Hadoop 2 - mvn -B test -Phadoop2 + mvn -Dmaven.build.cache.enabled=false -B test -Phadoop2 ;; dist) mvn -P dist package -DskipTests javadoc:aggregate diff --git a/lang/java/idl/pom.xml b/lang/java/idl/pom.xml index 6363ec150a5..7826cf42b89 100644 --- a/lang/java/idl/pom.xml +++ b/lang/java/idl/pom.xml @@ -102,6 +102,7 @@ ${project.basedir}/../../../share/idl_grammar ${project.basedir}/../../../share/idl_grammar/imports + true true false diff --git a/lang/java/idl/src/main/java/org/apache/avro/idl/IdlFile.java b/lang/java/idl/src/main/java/org/apache/avro/idl/IdlFile.java index b3777c9f790..75d0f73faaf 100644 --- a/lang/java/idl/src/main/java/org/apache/avro/idl/IdlFile.java +++ b/lang/java/idl/src/main/java/org/apache/avro/idl/IdlFile.java @@ -34,21 +34,18 @@ public class IdlFile { private final Schema mainSchema; private final Protocol protocol; - private final String namespace; private final Map namedSchemas; private final List warnings; IdlFile(Protocol protocol, List warnings) { - this(protocol.getNamespace(), protocol.getTypes(), null, protocol, warnings); + this(protocol.getTypes(), null, protocol, warnings); } - IdlFile(String namespace, Schema mainSchema, Iterable schemas, List warnings) { - this(namespace, schemas, mainSchema, null, warnings); + IdlFile(Schema mainSchema, Iterable schemas, List warnings) { + this(schemas, mainSchema, null, warnings); } - private IdlFile(String namespace, Iterable schemas, Schema mainSchema, Protocol protocol, - List warnings) { - this.namespace = namespace; + private IdlFile(Iterable schemas, Schema mainSchema, Protocol protocol, List warnings) { this.namedSchemas = new LinkedHashMap<>(); for (Schema namedSchema : schemas) { this.namedSchemas.put(namedSchema.getFullName(), namedSchema); @@ -82,13 +79,6 @@ public List getWarnings(String importFile) { .collect(Collectors.toList()); } - /** - * The default namespace to resolve schema names against. - */ - public String getNamespace() { - return namespace; - } - /** * The named schemas defined by the IDL file, mapped by their full name. */ @@ -105,14 +95,7 @@ public Map getNamedSchemas() { * @return the schema, or {@code null} if it does not exist */ public Schema getNamedSchema(String name) { - Schema result = namedSchemas.get(name); - if (result != null) { - return result; - } - if (namespace != null && !name.contains(".")) { - result = namedSchemas.get(namespace + '.' + name); - } - return result; + return namedSchemas.get(name); } // Visible for testing diff --git a/lang/java/idl/src/main/java/org/apache/avro/idl/IdlReader.java b/lang/java/idl/src/main/java/org/apache/avro/idl/IdlReader.java index f2419f5f551..047d162879b 100644 --- a/lang/java/idl/src/main/java/org/apache/avro/idl/IdlReader.java +++ b/lang/java/idl/src/main/java/org/apache/avro/idl/IdlReader.java @@ -35,8 +35,11 @@ import org.antlr.v4.runtime.Recognizer; import org.antlr.v4.runtime.Token; import org.apache.avro.JsonProperties; +import org.apache.avro.JsonSchemaParser; import org.apache.avro.LogicalType; import org.apache.avro.LogicalTypes; +import org.apache.avro.NameValidator; +import org.apache.avro.ParseContext; import org.apache.avro.Protocol; import org.apache.avro.Schema; import org.apache.avro.SchemaParseException; @@ -68,6 +71,8 @@ import org.apache.avro.idl.IdlParser.SchemaPropertyContext; import org.apache.avro.idl.IdlParser.UnionTypeContext; import org.apache.avro.idl.IdlParser.VariableDeclarationContext; +import org.apache.avro.util.SchemaResolver; +import org.apache.avro.util.UtfTextUtils; import org.apache.avro.util.internal.Accessor; import org.apache.commons.text.StringEscapeUtils; @@ -99,7 +104,6 @@ import java.util.regex.Pattern; import static java.util.Collections.singleton; -import static java.util.Collections.unmodifiableMap; public class IdlReader { /** @@ -134,50 +138,47 @@ public void syntaxError(Recognizer recognizer, Object offendingSymbol, int * Predicate to check for valid names. Should probably be delegated to the * Schema class. */ - private static final Predicate VALID_NAME = Pattern.compile("[_\\p{L}][_\\p{L}\\d]*").asPredicate(); + private static final Predicate VALID_NAME = Pattern + .compile("[_\\p{L}][_\\p{LD}]*", Pattern.UNICODE_CHARACTER_CLASS | Pattern.UNICODE_CASE | Pattern.CANON_EQ) + .asPredicate(); + private static final Set INVALID_TYPE_NAMES = new HashSet<>(Arrays.asList("boolean", "int", "long", "float", "double", "bytes", "string", "null", "date", "time_ms", "timestamp_ms", "localtimestamp_ms", "uuid")); private static final String CLASSPATH_SCHEME = "classpath"; private final Set readLocations; - private final Map names; + private final ParseContext parseContext; public IdlReader() { - readLocations = new HashSet<>(); - names = new LinkedHashMap<>(); + this(new ParseContext()); } - public Map getTypes() { - return unmodifiableMap(names); + public IdlReader(NameValidator nameValidator) { + this(new ParseContext(nameValidator)); } - private Schema namedSchemaOrUnresolved(String fullName) { - Schema schema = names.get(fullName); - if (schema == null) { - schema = SchemaResolver.unresolvedSchema(fullName); - } - return schema; + public IdlReader(ParseContext parseContext) { + readLocations = new HashSet<>(); + this.parseContext = parseContext; } - private void setTypes(Map types) { - names.clear(); - for (Schema schema : types.values()) { - addSchema(schema); - } + private Schema namedSchemaOrUnresolved(String fullName) { + return parseContext.resolve(fullName); } - public void addTypes(Map types) { - for (Schema schema : types.values()) { - addSchema(schema); - } + private void addSchema(Schema schema) { + parseContext.put(schema); } - private void addSchema(Schema schema) { - String fullName = schema.getFullName(); - if (names.containsKey(fullName)) { - throw new SchemaParseException("Can't redefine: " + fullName); + public IdlFile resolve(IdlFile unresolved) { + Protocol protocol = unresolved.getProtocol(); + if (protocol == null) { + Schema mainSchema = SchemaResolver.resolve(parseContext, unresolved.getMainSchema()); + Iterable namedSchemas = SchemaResolver.resolve(parseContext, unresolved.getNamedSchemas().values()); + return new IdlFile(mainSchema, namedSchemas, unresolved.getWarnings()); + } else { + return new IdlFile(SchemaResolver.resolve(parseContext, protocol), unresolved.getWarnings()); } - names.put(fullName, schema); } public IdlFile parse(Path location) throws IOException { @@ -185,20 +186,28 @@ public IdlFile parse(Path location) throws IOException { } IdlFile parse(URI location) throws IOException { - try (InputStream stream = location.toURL().openStream()) { - readLocations.add(location); - URI inputDir = location; - if ("jar".equals(location.getScheme())) { - String jarUriAsString = location.toString(); - String pathFromJarRoot = jarUriAsString.substring(jarUriAsString.indexOf("!/") + 2); - inputDir = URI.create(CLASSPATH_SCHEME + ":/" + pathFromJarRoot); - } - inputDir = inputDir.resolve("."); + readLocations.add(location); + URI inputDir = location; + if ("jar".equals(location.getScheme())) { + String jarUriAsString = location.toString(); + String pathFromJarRoot = jarUriAsString.substring(jarUriAsString.indexOf("!/") + 2); + inputDir = URI.create(CLASSPATH_SCHEME + ":/" + pathFromJarRoot); + } + inputDir = inputDir.resolve("."); - return parse(inputDir, CharStreams.fromStream(stream, StandardCharsets.UTF_8)); + try (InputStream stream = location.toURL().openStream()) { + String inputString = UtfTextUtils.readAllBytes(stream, null); + return parse(inputDir, CharStreams.fromString(inputString)); } } + /** + * Parse an IDL file from a string, using the given directory for imports. + */ + public IdlFile parse(URI directory, CharSequence source) throws IOException { + return parse(directory, CharStreams.fromString(source.toString())); + } + /** * Parse an IDL file from a stream. This method cannot handle imports. */ @@ -219,8 +228,14 @@ private IdlFile parse(URI inputDir, CharStream charStream) { parser.setTrace(false); parser.setBuildParseTree(false); - // Trigger parsing. - parser.idlFile(); + try { + // Trigger parsing. + parser.idlFile(); + } catch (SchemaParseException e) { + throw e; + } catch (RuntimeException e) { + throw new SchemaParseException(e); + } return parseListener.getIdlFile(); } @@ -348,13 +363,11 @@ private void popNamespace() { @Override public void exitIdlFile(IdlFileContext ctx) { - IdlFile unresolved; if (protocol == null) { - unresolved = new IdlFile(currentNamespace(), mainSchema, getTypes().values(), warnings); + result = new IdlFile(mainSchema, parseContext.typesByName().values(), warnings); } else { - unresolved = new IdlFile(protocol, warnings); + result = new IdlFile(protocol, warnings); } - result = SchemaResolver.resolve(unresolved, OPTIONAL_NULLABLE_TYPE_PROPERTY); } @Override @@ -378,7 +391,7 @@ public void enterProtocolDeclarationBody(ProtocolDeclarationBodyContext ctx) { @Override public void exitProtocolDeclaration(ProtocolDeclarationContext ctx) { if (protocol != null) - protocol.setTypes(getTypes().values()); + protocol.setTypes(parseContext.typesByName().values()); if (!namespaces.isEmpty()) popNamespace(); } @@ -440,10 +453,8 @@ public void exitImportStatement(ImportStatementContext importContext) { break; case IdlParser.Schema: try (InputStream stream = importLocation.toURL().openStream()) { - Schema.Parser parser = new Schema.Parser(); - parser.addTypes(getTypes().values()); // inherit names - parser.parse(stream); - setTypes(parser.getTypes()); // update names + JsonSchemaParser parser = new JsonSchemaParser(); + parser.parse(parseContext, importLocation.resolve("."), UtfTextUtils.readAllBytes(stream, null)); } break; } @@ -671,18 +682,22 @@ private JsonNode fixDefaultValue(JsonNode defaultValue, Schema fieldType) { */ private Schema fixOptionalSchema(Schema schema, JsonNode defaultValue) { Object optionalType = schema.getObjectProp(OPTIONAL_NULLABLE_TYPE_PROPERTY); - if (optionalType != null) { - // The schema is a union schema with 2 types: "null" and a non-"null" schema - Schema nullSchema = schema.getTypes().get(0); - Schema nonNullSchema = schema.getTypes().get(1); - boolean nonNullDefault = defaultValue != null && !defaultValue.isNull(); - - // Note: the resolving visitor we'll use later drops the marker property. - if (nonNullDefault) { - return Schema.createUnion(nonNullSchema, nullSchema); - } + if (optionalType == null) { + return schema; + } + + // The schema is a union schema with 2 types: "null" and a non-"null" + // schema. The result of this method must not have the property + // OPTIONAL_NULLABLE_TYPE_PROPERTY. + Schema nullSchema = schema.getTypes().get(0); + Schema nonNullSchema = schema.getTypes().get(1); + boolean nonNullDefault = defaultValue != null && !defaultValue.isNull(); + + if (nonNullDefault) { + return Schema.createUnion(nonNullSchema, nullSchema); + } else { + return Schema.createUnion(nullSchema, nonNullSchema); } - return schema; } @Override diff --git a/lang/java/idl/src/main/java/org/apache/avro/idl/IdlSchemaParser.java b/lang/java/idl/src/main/java/org/apache/avro/idl/IdlSchemaParser.java new file mode 100644 index 00000000000..618ac6254a6 --- /dev/null +++ b/lang/java/idl/src/main/java/org/apache/avro/idl/IdlSchemaParser.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.avro.idl; + +import org.apache.avro.FormattedSchemaParser; +import org.apache.avro.ParseContext; +import org.apache.avro.Schema; +import org.apache.avro.SchemaParseException; + +import java.io.IOException; +import java.net.URI; +import java.util.regex.Pattern; + +public class IdlSchemaParser implements FormattedSchemaParser { + + @Override + public Schema parse(ParseContext parseContext, URI baseUri, CharSequence formattedSchema) + throws IOException, SchemaParseException { + boolean valid = Pattern.compile("^\\A*!" + // Initial whitespace + "(?:/\\*(?:[^*]|\\*[^/])*!\\*/\\s*!|//(!=\\R)*!\\R\\s*!)*!" + // Comments + "(?:namespace|schema|protocol|record|enum|fixed|import)\\s", // First keyword + Pattern.UNICODE_CHARACTER_CLASS | Pattern.MULTILINE).matcher(formattedSchema).find(); + if (valid) { + IdlReader idlReader = new IdlReader(parseContext); + IdlFile idlFile = idlReader.parse(baseUri, formattedSchema); + return idlFile.getMainSchema(); + } + return null; + } +} diff --git a/lang/java/idl/src/main/java/org/apache/avro/idl/IsResolvedSchemaVisitor.java b/lang/java/idl/src/main/java/org/apache/avro/idl/IsResolvedSchemaVisitor.java deleted file mode 100644 index 12fd5dbff21..00000000000 --- a/lang/java/idl/src/main/java/org/apache/avro/idl/IsResolvedSchemaVisitor.java +++ /dev/null @@ -1,60 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.avro.idl; - -import org.apache.avro.Schema; - -/** - * This visitor checks if the current schema is fully resolved. - */ -public final class IsResolvedSchemaVisitor implements SchemaVisitor { - boolean hasUnresolvedParts; - - IsResolvedSchemaVisitor() { - hasUnresolvedParts = false; - } - - @Override - public SchemaVisitorAction visitTerminal(Schema terminal) { - hasUnresolvedParts = SchemaResolver.isUnresolvedSchema(terminal); - return hasUnresolvedParts ? SchemaVisitorAction.TERMINATE : SchemaVisitorAction.CONTINUE; - } - - @Override - public SchemaVisitorAction visitNonTerminal(Schema nonTerminal) { - hasUnresolvedParts = SchemaResolver.isUnresolvedSchema(nonTerminal); - if (hasUnresolvedParts) { - return SchemaVisitorAction.TERMINATE; - } - if (nonTerminal.getType() == Schema.Type.RECORD && !nonTerminal.hasFields()) { - // We're still initializing the type... - return SchemaVisitorAction.SKIP_SUBTREE; - } - return SchemaVisitorAction.CONTINUE; - } - - @Override - public SchemaVisitorAction afterVisitNonTerminal(Schema nonTerminal) { - return SchemaVisitorAction.CONTINUE; - } - - @Override - public Boolean get() { - return !hasUnresolvedParts; - } -} diff --git a/lang/java/idl/src/main/java/org/apache/avro/idl/ResolvingVisitor.java b/lang/java/idl/src/main/java/org/apache/avro/idl/ResolvingVisitor.java deleted file mode 100644 index 04e41f3403a..00000000000 --- a/lang/java/idl/src/main/java/org/apache/avro/idl/ResolvingVisitor.java +++ /dev/null @@ -1,192 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.avro.idl; - -import org.apache.avro.AvroTypeException; -import org.apache.avro.Schema; -import org.apache.avro.Schema.Field; - -import java.util.ArrayList; -import java.util.Arrays; -import java.util.EnumSet; -import java.util.HashSet; -import java.util.IdentityHashMap; -import java.util.List; -import java.util.Optional; -import java.util.Set; -import java.util.function.Function; - -import static org.apache.avro.Schema.Type.ARRAY; -import static org.apache.avro.Schema.Type.ENUM; -import static org.apache.avro.Schema.Type.FIXED; -import static org.apache.avro.Schema.Type.MAP; -import static org.apache.avro.Schema.Type.RECORD; -import static org.apache.avro.Schema.Type.UNION; - -/** - * This visitor creates clone of the visited Schemata, minus the specified - * schema properties, and resolves all unresolved schemas. - */ -public final class ResolvingVisitor implements SchemaVisitor { - private static final Set CONTAINER_SCHEMA_TYPES = EnumSet.of(RECORD, ARRAY, MAP, UNION); - private static final Set NAMED_SCHEMA_TYPES = EnumSet.of(RECORD, ENUM, FIXED); - - private final Function symbolTable; - private final Set schemaPropertiesToRemove; - private final IdentityHashMap replace; - - private final Schema root; - - public ResolvingVisitor(final Schema root, final Function symbolTable, - String... schemaPropertiesToRemove) { - this(root, symbolTable, new HashSet<>(Arrays.asList(schemaPropertiesToRemove))); - } - - public ResolvingVisitor(final Schema root, final Function symbolTable, - Set schemaPropertiesToRemove) { - this.replace = new IdentityHashMap<>(); - this.symbolTable = symbolTable; - this.schemaPropertiesToRemove = schemaPropertiesToRemove; - - this.root = root; - } - - public ResolvingVisitor withRoot(Schema root) { - return new ResolvingVisitor(root, symbolTable, schemaPropertiesToRemove); - } - - @Override - public SchemaVisitorAction visitTerminal(final Schema terminal) { - Schema.Type type = terminal.getType(); - Schema newSchema; - if (CONTAINER_SCHEMA_TYPES.contains(type)) { - if (!replace.containsKey(terminal)) { - throw new IllegalStateException("Schema " + terminal + " must be already processed"); - } - return SchemaVisitorAction.CONTINUE; - } else if (type == ENUM) { - newSchema = Schema.createEnum(terminal.getName(), terminal.getDoc(), terminal.getNamespace(), - terminal.getEnumSymbols(), terminal.getEnumDefault()); - } else if (type == FIXED) { - newSchema = Schema.createFixed(terminal.getName(), terminal.getDoc(), terminal.getNamespace(), - terminal.getFixedSize()); - } else { - newSchema = Schema.create(type); - } - copyProperties(terminal, newSchema); - replace.put(terminal, newSchema); - return SchemaVisitorAction.CONTINUE; - } - - public void copyProperties(final Schema first, final Schema second) { - // Logical type - Optional.ofNullable(first.getLogicalType()).ifPresent(logicalType -> logicalType.addToSchema(second)); - - // Aliases (if applicable) - if (NAMED_SCHEMA_TYPES.contains(first.getType())) { - first.getAliases().forEach(second::addAlias); - } - - // Other properties - first.getObjectProps().forEach((name, value) -> { - if (!schemaPropertiesToRemove.contains(name)) { - second.addProp(name, value); - } - }); - } - - @Override - public SchemaVisitorAction visitNonTerminal(final Schema nt) { - Schema.Type type = nt.getType(); - if (type == RECORD) { - if (SchemaResolver.isUnresolvedSchema(nt)) { - // unresolved schema will get a replacement that we already encountered, - // or we will attempt to resolve. - final String unresolvedSchemaName = SchemaResolver.getUnresolvedSchemaName(nt); - Schema resSchema = symbolTable.apply(unresolvedSchemaName); - if (resSchema == null) { - throw new AvroTypeException("Unable to resolve " + unresolvedSchemaName); - } - Schema replacement = replace.computeIfAbsent(resSchema, schema -> { - Schemas.visit(schema, this); - return replace.get(schema); - }); - replace.put(nt, replacement); - } else { - // create a fieldless clone. Fields will be added in afterVisitNonTerminal. - Schema newSchema = Schema.createRecord(nt.getName(), nt.getDoc(), nt.getNamespace(), nt.isError()); - copyProperties(nt, newSchema); - replace.put(nt, newSchema); - } - } - return SchemaVisitorAction.CONTINUE; - } - - @Override - public SchemaVisitorAction afterVisitNonTerminal(final Schema nt) { - Schema.Type type = nt.getType(); - Schema newSchema; - switch (type) { - case RECORD: - if (!SchemaResolver.isUnresolvedSchema(nt)) { - newSchema = replace.get(nt); - // Check if we've already handled the replacement schema with a - // reentrant call to visit(...) from within the visitor. - if (!newSchema.hasFields()) { - List fields = nt.getFields(); - List newFields = new ArrayList<>(fields.size()); - for (Schema.Field field : fields) { - newFields.add(new Field(field, replace.get(field.schema()))); - } - newSchema.setFields(newFields); - } - } - return SchemaVisitorAction.CONTINUE; - case UNION: - List types = nt.getTypes(); - List newTypes = new ArrayList<>(types.size()); - for (Schema sch : types) { - newTypes.add(replace.get(sch)); - } - newSchema = Schema.createUnion(newTypes); - break; - case ARRAY: - newSchema = Schema.createArray(replace.get(nt.getElementType())); - break; - case MAP: - newSchema = Schema.createMap(replace.get(nt.getValueType())); - break; - default: - throw new IllegalStateException("Illegal type " + type + ", schema " + nt); - } - copyProperties(nt, newSchema); - replace.put(nt, newSchema); - return SchemaVisitorAction.CONTINUE; - } - - @Override - public Schema get() { - return replace.get(root); - } - - @Override - public String toString() { - return "ResolvingVisitor{symbolTable=" + symbolTable + ", schemaPropertiesToRemove=" + schemaPropertiesToRemove - + ", replace=" + replace + '}'; - } -} diff --git a/lang/java/idl/src/main/java/org/apache/avro/idl/SchemaResolver.java b/lang/java/idl/src/main/java/org/apache/avro/idl/SchemaResolver.java deleted file mode 100644 index 8c9a9c15b99..00000000000 --- a/lang/java/idl/src/main/java/org/apache/avro/idl/SchemaResolver.java +++ /dev/null @@ -1,149 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.avro.idl; - -import org.apache.avro.JsonProperties; -import org.apache.avro.Protocol; -import org.apache.avro.Schema; - -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.function.Function; -import java.util.stream.Collectors; - -/** - * Utility class to resolve schemas that are unavailable at the point they are - * referenced in the IDL. - */ -final class SchemaResolver { - - private SchemaResolver() { - } - - private static final String UR_SCHEMA_ATTR = "org.apache.avro.idl.unresolved.name"; - - private static final String UR_SCHEMA_NAME = "UnresolvedSchema"; - - private static final String UR_SCHEMA_NS = "org.apache.avro.compiler"; - - private static final AtomicInteger COUNTER = new AtomicInteger(); - - /** - * Create a schema to represent an "unresolved" schema. (used to represent a - * schema whose definition does not exist, yet). - * - * @param name a schema name - * @return an unresolved schema for the given name - */ - static Schema unresolvedSchema(final String name) { - Schema schema = Schema.createRecord(UR_SCHEMA_NAME + '_' + COUNTER.getAndIncrement(), "unresolved schema", - UR_SCHEMA_NS, false, Collections.emptyList()); - schema.addProp(UR_SCHEMA_ATTR, name); - return schema; - } - - /** - * Is this an unresolved schema. - * - * @param schema a schema - * @return whether the schema is an unresolved schema - */ - static boolean isUnresolvedSchema(final Schema schema) { - return (schema.getType() == Schema.Type.RECORD && schema.getProp(UR_SCHEMA_ATTR) != null && schema.getName() != null - && schema.getName().startsWith(UR_SCHEMA_NAME) && UR_SCHEMA_NS.equals(schema.getNamespace())); - } - - /** - * Get the unresolved schema name. - * - * @param schema an unresolved schema - * @return the name of the unresolved schema - */ - static String getUnresolvedSchemaName(final Schema schema) { - if (!isUnresolvedSchema(schema)) { - throw new IllegalArgumentException("Not a unresolved schema: " + schema); - } - return schema.getProp(UR_SCHEMA_ATTR); - } - - /** - * Is this an unresolved schema? - */ - static boolean isFullyResolvedSchema(final Schema schema) { - if (isUnresolvedSchema(schema)) { - return false; - } else { - return Schemas.visit(schema, new IsResolvedSchemaVisitor()); - } - } - - /** - * Clone all provided schemas while resolving all unreferenced schemas. - * - * @param idlFile a parsed IDL file - * @return a copy of idlFile with all schemas resolved - */ - static IdlFile resolve(final IdlFile idlFile, String... schemaPropertiesToRemove) { - if (idlFile.getProtocol() != null) { - return new IdlFile(resolve(idlFile.getProtocol(), schemaPropertiesToRemove), idlFile.getWarnings()); - } - - ResolvingVisitor visitor = new ResolvingVisitor(null, idlFile::getNamedSchema, schemaPropertiesToRemove); - Function resolver = schema -> Schemas.visit(schema, visitor.withRoot(schema)); - - List namedSchemata = idlFile.getNamedSchemas().values().stream().map(resolver).collect(Collectors.toList()); - Schema mainSchema = Optional.ofNullable(idlFile.getMainSchema()).map(resolver).orElse(null); - return new IdlFile(idlFile.getNamespace(), mainSchema, namedSchemata, idlFile.getWarnings()); - } - - /** - * Will clone the provided protocol while resolving all unreferenced schemas - * - * @param protocol a parsed protocol - * @return a copy of the protocol with all schemas resolved - */ - static Protocol resolve(final Protocol protocol, String... schemaPropertiesToRemove) { - // Create an empty copy of the protocol - Protocol result = new Protocol(protocol.getName(), protocol.getDoc(), protocol.getNamespace()); - protocol.getObjectProps().forEach(((JsonProperties) result)::addProp); - - ResolvingVisitor visitor = new ResolvingVisitor(null, protocol::getType, schemaPropertiesToRemove); - Function resolver = schema -> Schemas.visit(schema, visitor.withRoot(schema)); - - // Resolve all schemata in the protocol. - result.setTypes(protocol.getTypes().stream().map(resolver).collect(Collectors.toList())); - Map resultMessages = result.getMessages(); - protocol.getMessages().forEach((name, oldValue) -> { - Protocol.Message newValue; - if (oldValue.isOneWay()) { - newValue = result.createMessage(oldValue.getName(), oldValue.getDoc(), oldValue, - resolver.apply(oldValue.getRequest())); - } else { - Schema request = resolver.apply(oldValue.getRequest()); - Schema response = resolver.apply(oldValue.getResponse()); - Schema errors = resolver.apply(oldValue.getErrors()); - newValue = result.createMessage(oldValue.getName(), oldValue.getDoc(), oldValue, request, response, errors); - } - resultMessages.put(name, newValue); - }); - return result; - } -} diff --git a/lang/java/idl/src/main/java/org/apache/avro/idl/SchemaVisitorAction.java b/lang/java/idl/src/main/java/org/apache/avro/idl/SchemaVisitorAction.java deleted file mode 100644 index 6aed09b3d32..00000000000 --- a/lang/java/idl/src/main/java/org/apache/avro/idl/SchemaVisitorAction.java +++ /dev/null @@ -1,40 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * https://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.avro.idl; - -public enum SchemaVisitorAction { - - /** - * continue visit. - */ - CONTINUE, - /** - * terminate visit. - */ - TERMINATE, - /** - * when returned from pre non terminal visit method the children of the non - * terminal are skipped. afterVisitNonTerminal for the current schema will not - * be invoked. - */ - SKIP_SUBTREE, - /** - * Skip visiting the siblings of this schema. - */ - SKIP_SIBLINGS -} diff --git a/lang/java/idl/src/main/resources/META-INF/services/org.apache.avro.FormattedSchemaParser b/lang/java/idl/src/main/resources/META-INF/services/org.apache.avro.FormattedSchemaParser new file mode 100644 index 00000000000..acb4986e419 --- /dev/null +++ b/lang/java/idl/src/main/resources/META-INF/services/org.apache.avro.FormattedSchemaParser @@ -0,0 +1,18 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +org.apache.avro.idl.IdlSchemaParser diff --git a/lang/java/idl/src/test/java/org/apache/avro/idl/IdlReaderTest.java b/lang/java/idl/src/test/java/org/apache/avro/idl/IdlReaderTest.java index 8e9f187f4ce..9b503bcb2a5 100644 --- a/lang/java/idl/src/test/java/org/apache/avro/idl/IdlReaderTest.java +++ b/lang/java/idl/src/test/java/org/apache/avro/idl/IdlReaderTest.java @@ -97,7 +97,6 @@ public void validateProtocolParsingResult() throws IOException { assertEquals(1, idlFile.getNamedSchemas().size()); idlFile.getNamedSchemas().keySet().forEach(System.out::println); assertNotNull(idlFile.getNamedSchema("communication.Message")); - assertNotNull(idlFile.getNamedSchema("Message")); assertNotNull(idlFile.getProtocol()); assertNull(idlFile.getMainSchema()); @@ -111,12 +110,11 @@ public void validateSchemaParsingResult() throws IOException { assertEquals(1, idlFile.getNamedSchemas().size()); idlFile.getNamedSchemas().keySet().forEach(System.out::println); assertNotNull(idlFile.getNamedSchema("communication.Message")); - assertNotNull(idlFile.getNamedSchema("Message")); assertNull(idlFile.getProtocol()); Schema mainSchema = idlFile.getMainSchema(); assertEquals(Schema.Type.ARRAY, mainSchema.getType()); - assertEquals(idlFile.getNamedSchema("Message"), mainSchema.getElementType()); + assertEquals(idlFile.getNamedSchema("communication.Message"), mainSchema.getElementType()); } @Test diff --git a/lang/java/idl/src/test/java/org/apache/avro/idl/TestCycle.java b/lang/java/idl/src/test/java/org/apache/avro/idl/TestCycle.java index 427de0957d6..a3c94ff025d 100644 --- a/lang/java/idl/src/test/java/org/apache/avro/idl/TestCycle.java +++ b/lang/java/idl/src/test/java/org/apache/avro/idl/TestCycle.java @@ -44,26 +44,27 @@ public class TestCycle { @Test public void testCycleGeneration() throws IOException, URISyntaxException { final ClassLoader cl = Thread.currentThread().getContextClassLoader(); - IdlFile idlFile = new IdlReader().parse(requireNonNull(cl.getResource("input/cycle.avdl")).toURI()); + IdlReader parser = new IdlReader(); + IdlFile idlFile = parser.resolve(parser.parse(requireNonNull(cl.getResource("input/cycle.avdl")).toURI())); String json = idlFile.outputString(); LOG.info(json); - GenericRecordBuilder rb2 = new GenericRecordBuilder(idlFile.getNamedSchema("SampleNode")); + GenericRecordBuilder rb2 = new GenericRecordBuilder(idlFile.getNamedSchema("org.apache.avro.gen.SampleNode")); rb2.set("count", 10); rb2.set("subNodes", Collections.EMPTY_LIST); GenericData.Record node = rb2.build(); - GenericRecordBuilder mb = new GenericRecordBuilder(idlFile.getNamedSchema("Method")); + GenericRecordBuilder mb = new GenericRecordBuilder(idlFile.getNamedSchema("org.apache.avro.gen.Method")); mb.set("declaringClass", "Test"); mb.set("methodName", "test"); GenericData.Record method = mb.build(); - GenericRecordBuilder spb = new GenericRecordBuilder(idlFile.getNamedSchema("SamplePair")); + GenericRecordBuilder spb = new GenericRecordBuilder(idlFile.getNamedSchema("org.apache.avro.gen.SamplePair")); spb.set("method", method); spb.set("node", node); GenericData.Record sp = spb.build(); - GenericRecordBuilder rb = new GenericRecordBuilder(idlFile.getNamedSchema("SampleNode")); + GenericRecordBuilder rb = new GenericRecordBuilder(idlFile.getNamedSchema("org.apache.avro.gen.SampleNode")); rb.set("count", 10); rb.set("subNodes", Collections.singletonList(sp)); GenericData.Record record = rb.build(); diff --git a/lang/java/ipc/src/test/java/org/apache/avro/TestSchema.java b/lang/java/ipc/src/test/java/org/apache/avro/TestSchema.java index d85b28effa3..ab35a6d837a 100644 --- a/lang/java/ipc/src/test/java/org/apache/avro/TestSchema.java +++ b/lang/java/ipc/src/test/java/org/apache/avro/TestSchema.java @@ -205,9 +205,9 @@ void record(TestInfo testInfo) throws Exception { @Test void invalidNameTolerance() { - new Schema.Parser(Schema.NameValidator.NO_VALIDATION).parse("{\"type\":\"record\",\"name\":\"1X\",\"fields\":[]}"); - new Schema.Parser(Schema.NameValidator.NO_VALIDATION).parse("{\"type\":\"record\",\"name\":\"X-\",\"fields\":[]}"); - new Schema.Parser(Schema.NameValidator.NO_VALIDATION).parse("{\"type\":\"record\",\"name\":\"X$\",\"fields\":[]}"); + new Schema.Parser(NameValidator.NO_VALIDATION).parse("{\"type\":\"record\",\"name\":\"1X\",\"fields\":[]}"); + new Schema.Parser(NameValidator.NO_VALIDATION).parse("{\"type\":\"record\",\"name\":\"X-\",\"fields\":[]}"); + new Schema.Parser(NameValidator.NO_VALIDATION).parse("{\"type\":\"record\",\"name\":\"X$\",\"fields\":[]}"); } @Test diff --git a/lang/java/maven-plugin/src/main/java/org/apache/avro/mojo/IDLMojo.java b/lang/java/maven-plugin/src/main/java/org/apache/avro/mojo/IDLMojo.java index 15f6a6c0c4e..d6db625d1de 100644 --- a/lang/java/maven-plugin/src/main/java/org/apache/avro/mojo/IDLMojo.java +++ b/lang/java/maven-plugin/src/main/java/org/apache/avro/mojo/IDLMojo.java @@ -18,21 +18,20 @@ package org.apache.avro.mojo; -import java.io.File; -import java.io.IOException; -import java.net.URL; -import java.net.URLClassLoader; -import java.util.ArrayList; -import java.util.List; - import org.apache.avro.Protocol; import org.apache.avro.compiler.specific.SpecificCompiler; import org.apache.avro.generic.GenericData; - import org.apache.avro.idl.IdlFile; import org.apache.avro.idl.IdlReader; import org.apache.maven.artifact.DependencyResolutionRequiredException; +import java.io.File; +import java.io.IOException; +import java.net.URL; +import java.net.URLClassLoader; +import java.util.ArrayList; +import java.util.List; + /** * Generate Java classes and interfaces from AvroIDL files (.avdl) * @@ -89,6 +88,7 @@ protected void doCompile(String filename, File sourceDirectory, File outputDirec for (String warning : idlFile.getWarnings()) { getLog().warn(warning); } + idlFile = parser.resolve(idlFile); final SpecificCompiler compiler; final Protocol protocol = idlFile.getProtocol(); if (protocol != null) { diff --git a/lang/java/pom.xml b/lang/java/pom.xml index f51f4df933b..c439548f835 100644 --- a/lang/java/pom.xml +++ b/lang/java/pom.xml @@ -314,6 +314,8 @@ import\s+[^\*\s]+\*;(\r\n|\r|\n) $1 + + diff --git a/lang/java/tools/src/main/java/org/apache/avro/tool/IdlTool.java b/lang/java/tools/src/main/java/org/apache/avro/tool/IdlTool.java index d20226b8e77..6ef82714678 100644 --- a/lang/java/tools/src/main/java/org/apache/avro/tool/IdlTool.java +++ b/lang/java/tools/src/main/java/org/apache/avro/tool/IdlTool.java @@ -25,9 +25,9 @@ import org.apache.avro.idl.IdlReader; import java.io.File; -import java.io.FileOutputStream; import java.io.InputStream; import java.io.PrintStream; +import java.nio.file.Files; import java.util.List; /** @@ -56,8 +56,9 @@ public int run(InputStream in, PrintStream out, PrintStream err, List ar File outputFile = "-".equals(outputName) ? null : new File(outputName); Schema m = null; - Protocol p = null; + Protocol p; if (useJavaCC) { + // noinspection deprecation try (Idl parser = new Idl(inputFile)) { p = parser.CompilationUnit(); for (String warning : parser.getWarningsAfterParsing()) { @@ -70,13 +71,14 @@ public int run(InputStream in, PrintStream out, PrintStream err, List ar for (String warning : idlFile.getWarnings()) { err.println("Warning: " + warning); } + idlFile = parser.resolve(idlFile); p = idlFile.getProtocol(); m = idlFile.getMainSchema(); } PrintStream parseOut = out; if (outputFile != null) { - parseOut = new PrintStream(new FileOutputStream(outputFile)); + parseOut = new PrintStream(Files.newOutputStream(outputFile.toPath())); } if (m == null && p == null) {