Support for Sink Codecs #3081

Closed
6 changes: 6 additions & 0 deletions data-prepper-plugins/avro-codecs/README.md
@@ -34,6 +34,11 @@ pipeline:
" {\"name\": \"name\", \"type\": \"string\"}," +
" {\"name\": \"age\", \"type\": \"int\"}]" +
"}";
tabular_schema: |
TABLE Person (colname datatype,
colname2 datatype,
colname3 datatype,
colname4 datatype)
exclude_keys:
- s3
buffer_type: in_memory
@@ -45,6 +50,7 @@ pipeline:

1) `schema`: A JSON string that the user can provide in the YAML file itself. The codec parses the schema object from this schema string.
2) `exclude_keys`: Keys of the event that the user wants to exclude when converting it to an Avro record.
3) `tabular_schema`: A multiline, Glue-like schema string that the user can provide in the YAML file itself. The codec builds the schema object from this schema string, as shown in the sketch below.
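
For example, a minimal sketch based on this PR's own test case: the Glue-like declaration

```
TABLE sesblog (eventType string,
               mail struct<col1:string>)
```

is converted into the following Avro record schema (nested `struct` fields become nested records auto-named `<table>_<counter>`):

```json
{"type":"record","name":"sesblog","fields":[
  {"name":"eventType","type":"string"},
  {"name":"mail","type":{"type":"record","name":"sesblog_0","fields":[
    {"name":"col1","type":"string"}]}}]}
```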

### Note:

AvroOutputCodec.java (data-prepper-plugins/avro-codecs)
@@ -57,6 +57,8 @@ public void start(final OutputStream outputStream, final Event event, final Stri
Objects.requireNonNull(outputStream);
if (config.getSchema() != null) {
schema = parseSchema(config.getSchema());
} else if (config.getTabularSchemaString() != null) {
schema = AvroSchemaParserFromTabularFormat.generateSchemaFromTabularString(config.getTabularSchemaString());
} else if (config.getFileLocation() != null) {
schema = AvroSchemaParser.parseSchemaFromJsonFile(config.getFileLocation());
} else if (config.getSchemaRegistryUrl() != null) {
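As the excerpt above shows, the codec resolves its schema from the first configured source, in this order: inline `schema`, then `tabular_schema`, then a schema file (`getFileLocation()`), then a schema registry (`getSchemaRegistryUrl()`). A minimal pipeline sketch using the new option (a hedged example assembled from the README snippets in this PR; only one schema source should be configured at a time):

```yaml
sink:
  - s3:
      codec:
        avro:
          tabular_schema: |
            TABLE Person (name string,
                          age int)
```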
AvroOutputCodecConfig.java (data-prepper-plugins/avro-codecs)
@@ -45,6 +45,9 @@ public class AvroOutputCodecConfig {
@JsonProperty("file_key")
private String fileKey;

@JsonProperty("tabular_schema")
private String tabularSchemaString;

public List<String> getExcludeKeys() {
return excludeKeys;
}
@@ -80,4 +83,7 @@ public void setExcludeKeys(List<String> excludeKeys) {
this.excludeKeys = excludeKeys;
}

public String getTabularSchemaString() {
return tabularSchemaString;
}
}
AvroSchemaParserFromTabularFormat.java (new file, data-prepper-plugins/avro-codecs)
@@ -0,0 +1,194 @@
package org.opensearch.dataprepper.plugins.codec.avro;

import org.apache.avro.Schema;

import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class AvroSchemaParserFromTabularFormat {

private static final String END_SCHEMA_STRING = "]}";
// Matches the outermost parenthesized column list, e.g. "(col1 type1, col2 type2)".
// Simplified from "\\(([^()]*|)*\\)", which matched the same strings but was prone to catastrophic backtracking.
private static final Pattern pattern = Pattern.compile("\\(([^()]*)\\)");

static Schema generateSchemaFromTabularString(final String inputString) throws IOException {
String recordSchemaOutputString = inputString;
recordSchemaOutputString = recordSchemaOutputString.trim();

final String tableName = extractTableName(recordSchemaOutputString.split("\\s+"));
recordSchemaOutputString = getStringFromParentheses(recordSchemaOutputString);
recordSchemaOutputString = removeSpaceAndReplaceParentheses(recordSchemaOutputString);

final StringBuilder mainSchemaBuilder = new StringBuilder();
final String baseSchemaStr = "{\"type\":\"record\",\"name\":\"" + tableName + "\",\"fields\":[";
mainSchemaBuilder.append(baseSchemaStr);

iterateRecursively(mainSchemaBuilder, recordSchemaOutputString, false, tableName, 0);

mainSchemaBuilder.append(END_SCHEMA_STRING);

return new Schema.Parser().parse(mainSchemaBuilder.toString());
}

private static String removeSpaceAndReplaceParentheses(String recordSchemaOutputString) {
recordSchemaOutputString = recordSchemaOutputString.replaceAll("\\(", "");
recordSchemaOutputString = recordSchemaOutputString.replaceAll("\\)", "");

recordSchemaOutputString = recordSchemaOutputString.trim();

recordSchemaOutputString = recordSchemaOutputString.replaceAll("\\n", "");
recordSchemaOutputString = recordSchemaOutputString.replaceAll(",\\s+", ",");
recordSchemaOutputString = recordSchemaOutputString.replaceAll(">\\s+", ">");
recordSchemaOutputString = recordSchemaOutputString.replaceAll("<\\s+", "<");
return recordSchemaOutputString;
}

private static String getStringFromParentheses(String recordSchemaOutputString) {
final Matcher matcher = pattern.matcher(recordSchemaOutputString);
if (matcher.find()) {
recordSchemaOutputString = matcher.group(0);
}
return recordSchemaOutputString;
}

private static String extractTableName(final String[] words) throws IOException {
String tableName = null;
if (words.length >= 2) {
tableName = words[1];
} else {
throw new IOException("Invalid schema string.");
}
return tableName;
}

private static String buildBaseSchemaString(final String part) {
return "{\"type\":\"record\",\"name\":\"" + part + "\",\"fields\":[";
}

private static String buildSchemaStringForArr() {
return "{\"type\":\"array\", \"items\":\"string\"}";
}

Review thread on `iterateRecursively`:

dlvenable (Member): This code appears to be taking a SQL-like schema and then converting it into an Avro schema. Right? Why not just accept the Avro schema in the pipeline configuration? This appears to be introducing an undefined language, and there may be many different edge cases that we are not accounting for. If this is using a well-defined language, can we perform a model mapping to make sure it is valid? Say, for example, it is a PostgreSQL DDL: can we parse the DDL using a PostgreSQL library into a Java model? Then we could perform a more accurate mapping.

omkarmmore95 (Contributor Author): Yes @dlvenable, this is Raj's suggestion to support a Glue-like schema and convert it into an Avro schema. We are already accepting an Avro schema in the pipeline YAML itself. As of now there is no library that maps a Glue-like schema structure to an Avro/Parquet schema, since Avro/Parquet also have nested and logical types.

dlvenable (Member): What is the syntax of this table structure?

omkarmmore95 (Contributor Author, Aug 2, 2023): @dlvenable, shared over mail, as I can't paste it here.

public static void iterateRecursively(final StringBuilder mainSchemaBuilder, final String recordSchema,
final boolean isStructString, final String tableName, int innerRecCounter) {
boolean isNameStringFormed = false;
StringBuilder fieldNameBuilder = new StringBuilder();
StringBuilder fieldTypeBuilder = new StringBuilder();
boolean isFirstRecordForName = true;

final char[] schemaStrCharArr = recordSchema.toCharArray();
int curPosInSchemaStrCharArr = 0;

while (curPosInSchemaStrCharArr < schemaStrCharArr.length) {
final char currentCharFromArr = schemaStrCharArr[curPosInSchemaStrCharArr];
curPosInSchemaStrCharArr++;

if (!isNameStringFormed) {
// Inside a struct the field name ends at ':'; at the top level it ends at a space.
if ((isStructString && currentCharFromArr == ':') || currentCharFromArr == ' ') {
if (isFirstRecordForName) {
mainSchemaBuilder.append("{\"name\":\"" + fieldNameBuilder.toString() + "\",\"type\":\"");
} else {
mainSchemaBuilder.append(",{\"name\":\"" + fieldNameBuilder.toString() + "\",\"type\":\"");
}
isNameStringFormed = true;
fieldNameBuilder = new StringBuilder();
isFirstRecordForName = false;
continue;
}
fieldNameBuilder.append(currentCharFromArr);
}

if (isNameStringFormed) {

if (currentCharFromArr == ',' || curPosInSchemaStrCharArr == schemaStrCharArr.length) {
if (curPosInSchemaStrCharArr == schemaStrCharArr.length) {
fieldTypeBuilder.append(currentCharFromArr);
}
final String type = fieldTypeBuilder.toString().trim() + "\"}";

mainSchemaBuilder.append(type);
isNameStringFormed = false;
fieldTypeBuilder = new StringBuilder();
continue;
}

fieldTypeBuilder.append(currentCharFromArr);
if ("struct".equals(fieldTypeBuilder.toString())) {
mainSchemaBuilder.deleteCharAt(mainSchemaBuilder.length() - 1);
mainSchemaBuilder.append(buildBaseSchemaString(tableName + "_" + innerRecCounter));
final String structSchemaStr = recordSchema.substring(curPosInSchemaStrCharArr);
final StringBuilder structString = new StringBuilder();
int openClosedCounter = 0;
int structSchemaStrEndBracketPos = 0;
for (final char innerChar : structSchemaStr.toCharArray()) {
structSchemaStrEndBracketPos++;
if (innerChar == '<') {
openClosedCounter++;
} else if (innerChar == '>') {
openClosedCounter--;
}
structString.append(innerChar);
if (openClosedCounter == 0) {
break;
}
}

final String innerRecord = structString.toString().substring(1, structSchemaStrEndBracketPos - 1);
iterateRecursively(mainSchemaBuilder, innerRecord, true,
tableName, innerRecCounter + 1);
mainSchemaBuilder.append("}");
curPosInSchemaStrCharArr = curPosInSchemaStrCharArr + structSchemaStrEndBracketPos;
if (curPosInSchemaStrCharArr < schemaStrCharArr.length) {
// Skip the comma that follows the closing '>' of the struct
curPosInSchemaStrCharArr++;
}
isNameStringFormed = false;
fieldTypeBuilder = new StringBuilder();
} else if ("array".equals(fieldTypeBuilder.toString())) {
mainSchemaBuilder.deleteCharAt(mainSchemaBuilder.length() - 1);
mainSchemaBuilder.append(buildSchemaStringForArr());
final String structSchemaStr = recordSchema.substring(curPosInSchemaStrCharArr);
int openClosedCounter = 0;
int structSchemaStrEndBracketPos = 0;

for (final char innerChar : structSchemaStr.toCharArray()) {
structSchemaStrEndBracketPos++;
if (innerChar == '<') {
openClosedCounter++;
} else if (innerChar == '>') {
openClosedCounter--;
}
if (openClosedCounter == 0) {
break;
}
}

mainSchemaBuilder.append("}");
curPosInSchemaStrCharArr = curPosInSchemaStrCharArr + structSchemaStrEndBracketPos;
if (curPosInSchemaStrCharArr < schemaStrCharArr.length) {
// Skip the comma that follows the closing '>' of the array
curPosInSchemaStrCharArr++;
}

isNameStringFormed = false;
fieldTypeBuilder = new StringBuilder();
}
}
}

if (isStructString) {
mainSchemaBuilder.append(END_SCHEMA_STRING);
}
}

}
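
A minimal usage sketch for the parser above (not part of the PR; since `generateSchemaFromTabularString` is package-private, this hypothetical caller is assumed to live in the same package):

```java
package org.opensearch.dataprepper.plugins.codec.avro;

import org.apache.avro.Schema;

import java.io.IOException;

public class TabularSchemaExample {
    public static void main(final String[] args) throws IOException {
        final String tabular =
                "TABLE Person (name string,\n" +
                "              address struct<city:string,zip:string>,\n" +
                "              age int)";
        // Produces a record "Person" whose "address" field is the nested record "Person_0".
        final Schema schema = AvroSchemaParserFromTabularFormat.generateSchemaFromTabularString(tabular);
        System.out.println(schema.toString(true));
    }
}
```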
AvroOutputCodecTest.java (data-prepper-plugins/avro-codecs)
@@ -35,6 +35,18 @@ public class AvroOutputCodecTest {
":\"name\",\"type\":\"string\"},{\"name\":\"nestedRecord\",\"type\":{\"type\":\"record\",\"name\":" +
"\"NestedRecord1\",\"fields\":[{\"name\":\"secondFieldInNestedRecord\",\"type\":\"int\"},{\"name\":\"" +
"firstFieldInNestedRecord\",\"type\":\"string\"}]}},{\"name\":\"age\",\"type\":\"int\"}]}";
private static final String expectedSchemaStringTabular = "{\"type\":\"record\",\"name\":\"sesblog\",\"fields\":[{\"name\"" +
":\"eventType\",\"type\":\"string\"},{\"name\":\"ews\",\"type\":\"string\"},{\"name\":\"mail\",\"type\":" +
"{\"type\":\"record\",\"name\":\"sesblog_0\",\"fields\":[{\"name\":\"col1\",\"type\":\"string\"},{\"name\":\"" +
"innercolName\",\"type\":{\"type\":\"record\",\"name\":\"sesblog_1\",\"fields\":[{\"name\":\"colInner\"," +
"\"type\":\"string\"}]}}]}},{\"name\":\"column2\",\"type\":\"string\"}]}";
private static final String inputString = "TABLE sesblog (\n" +
" eventType string,\n" +
" ews string,\n" +
" mail struct<col1:string,\n" +
" innercolName struct<colInner:string> \n" +
" >,\n" +
" collumn2 string) ";
private AvroOutputCodecConfig config;

private ByteArrayOutputStream outputStream;
@@ -174,4 +186,11 @@ private static Map<String, Object> convertRecordToMap(GenericRecord nestedRecord
}
return eventData;
}

@Test
public void testTabularSchemaParser() throws IOException {
Schema expectedSchema = new Schema.Parser().parse(expectedSchemaStringTabular);
Schema actualSchema = AvroSchemaParserFromTabularFormat.generateSchemaFromTabularString(inputString);
assertThat(actualSchema, Matchers.equalTo(expectedSchema));
}
}
7 changes: 6 additions & 1 deletion data-prepper-plugins/parquet-codecs/README.md
@@ -27,6 +27,11 @@ pipeline:
event_collect_timeout: 15s
codec:
parquet:
tabular_schema: |
TABLE Person (colname1 datatype,
colname2 datatype,
colname3 datatype,
colname4 datatype)
schema: "{\"namespace\": \"org.example.test\"," +
" \"type\": \"record\"," +
" \"name\": \"TestMessage\"," +
@@ -58,7 +63,7 @@ pipeline:
8) `schema_bucket`: Name of the S3 bucket in which the `schema.json` file is kept.
9) `file_key`: File key of the `schema.json` file kept in the S3 bucket.
10) `schema_region`: AWS Region of the S3 bucket in which the `schema.json` file is kept.
11) `tabular_schema`: A multiline, Glue-like schema string that the user can provide in the YAML file itself. The codec builds the schema object from this schema string. See the sketch after this list.
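
For instance, assuming the Parquet tabular parser mirrors the Avro one shown earlier in this PR, field types may be primitives or nested `struct<name:type,...>` declarations, and `array<...>` fields are currently emitted with string items:

```
TABLE sesblog (eventType string,
               tags array<string>,
               mail struct<col1:string>)
```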
### Note:

1) The user can provide only one schema at a time, i.e., through only one of the ways provided in the codec config.
ParquetOutputCodec.java (data-prepper-plugins/parquet-codecs)
@@ -84,6 +84,8 @@ public synchronized void start(File file) throws IOException {
void buildSchemaAndKey(final Event event, final String tagsTargetKey) throws IOException {
if (config.getSchema() != null) {
schema = parseSchema(config.getSchema());
} else if (config.getTabularSchema() != null) {
schema = ParquetSchemaParserFromTabularFormat.generateSchemaFromTabularString(config.getTabularSchema());
} else if (config.getFileLocation() != null) {
schema = ParquetSchemaParser.parseSchemaFromJsonFile(config.getFileLocation());
} else if (config.getSchemaRegistryUrl() != null) {
@@ -284,7 +286,7 @@ private static Object schemaMapper(final Schema.Field field, final Object rawVa
LOG.error("Unrecognised Field name : '{}' & type : '{}'", field.name(), fieldType);
break;
}
- } else {
+ } else if (field.schema().getLogicalType() != null) {
final String logicalTypeName = field.schema().getLogicalType().getName();
switch (logicalTypeName){
case "date":
@@ ... @@
}
return finalValue;
}
- boolean checkS3SchemaValidity() throws IOException {
+ boolean checkS3SchemaValidity() {
if (config.getSchemaBucket() != null && config.getFileKey() != null && config.getSchemaRegion() != null) {
return true;
} else {
LOG.error("Invalid S3 credentials, can't reach the schema file.");
- throw new IOException("Can't proceed without schema.");
+ return false;
}
}
}
ParquetOutputCodecConfig.java (data-prepper-plugins/parquet-codecs)
@@ -57,6 +57,9 @@ public class ParquetOutputCodecConfig {
@JsonProperty("exclude_keys")
private List<String> excludeKeys = DEFAULT_EXCLUDE_KEYS;

@JsonProperty("tabular_schema")
private String tabularSchema;

@Valid
@Size(max = 0, message = "Schema from Schema Registry is not supported.")
@JsonProperty("schema_registry_url")
@@ -145,5 +148,8 @@ public void setSchemaRegistryUrl(String schemaRegistryUrl) {
public void setExcludeKeys(List<String> excludeKeys) {
this.excludeKeys = excludeKeys;
}
public String getTabularSchema() {
return tabularSchema;
}
}
