diff --git a/.github/workflows/metadata-ingestion.yml b/.github/workflows/metadata-ingestion.yml index 23d7ee9427f42..fb70c85fdec93 100644 --- a/.github/workflows/metadata-ingestion.yml +++ b/.github/workflows/metadata-ingestion.yml @@ -25,7 +25,7 @@ jobs: metadata-ingestion: runs-on: ubuntu-latest env: - SPARK_VERSION: 3.0.3 + SPARK_VERSION: 3.3.2 DATAHUB_TELEMETRY_ENABLED: false # TODO: Enable this once the test is fixed. # DATAHUB_LOOKML_GIT_TEST_SSH_KEY: ${{ secrets.DATAHUB_LOOKML_GIT_TEST_SSH_KEY }} diff --git a/metadata-ingestion/docs/sources/s3/s3.md b/metadata-ingestion/docs/sources/s3/s3.md index 93715629d0b8e..9484cd8de6666 100644 --- a/metadata-ingestion/docs/sources/s3/s3.md +++ b/metadata-ingestion/docs/sources/s3/s3.md @@ -196,3 +196,9 @@ If you are ingesting datasets from AWS S3, we recommend running the ingestion on Profiles are computed with PyDeequ, which relies on PySpark. Therefore, for computing profiles, we currently require Spark 3.0.3 with Hadoop 3.2 to be installed and the `SPARK_HOME` and `SPARK_VERSION` environment variables to be set. The Spark+Hadoop binary can be downloaded [here](https://www.apache.org/dyn/closer.lua/spark/spark-3.0.3/spark-3.0.3-bin-hadoop3.2.tgz). For an example guide on setting up PyDeequ on AWS, see [this guide](https://aws.amazon.com/blogs/big-data/testing-data-quality-at-scale-with-pydeequ/). + +:::caution + +From Spark 3.2.0 onward, the Avro reader fails on column names that do not start with a letter or that contain characters other than letters, numbers, and underscores (see [AvroFileFormat.scala](https://github.com/apache/spark/blob/72c62b6596d21e975c5597f8fff84b1a9d070a02/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroFileFormat.scala#L158)). +Avro files that contain such columns won't be profiled. 
+::: \ No newline at end of file diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index ded9186e08a22..9195dab7bf5b7 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -247,8 +247,8 @@ def get_long_description(): } data_lake_profiling = { - "pydeequ>=1.0.1, <1.1", - "pyspark==3.0.3", + "pydeequ==1.1.0", + "pyspark~=3.3.0", } delta_lake = { diff --git a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py index 4247ee9330cfb..ab5d3a4e007ac 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/s3/source.py +++ b/metadata-ingestion/src/datahub/ingestion/source/s3/source.py @@ -261,13 +261,14 @@ def init_spark(self): import pydeequ conf = SparkConf() - + spark_version = os.getenv("SPARK_VERSION", "3.3") conf.set( "spark.jars.packages", ",".join( [ "org.apache.hadoop:hadoop-aws:3.0.3", - "org.apache.spark:spark-avro_2.12:3.0.3", + # Spark's avro version needs to be matched with the Spark version + f"org.apache.spark:spark-avro_2.12:{spark_version}{'.0' if spark_version.count('.') == 1 else ''}", pydeequ.deequ_maven_coord, ] ), @@ -374,10 +375,10 @@ def read_file_spark(self, file: str, ext: str) -> Optional[DataFrame]: elif ext.endswith(".avro"): try: df = self.spark.read.format("avro").load(file) - except AnalysisException: + except AnalysisException as e: self.report.report_warning( file, - "To ingest avro files, please install the spark-avro package: https://mvnrepository.com/artifact/org.apache.spark/spark-avro_2.12/3.0.3", + f"Avro file reading failed with exception. 
The error was: {e}", ) return None diff --git a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_files.json b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_files.json index ceec764bfbc86..d59fce788c95e 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_files.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_files.json @@ -2782,7 +2782,7 @@ "customProperties": { "schema_inferred_from": "tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro", "number_of_files": "1", - "size_in_bytes": "1024" + "size_in_bytes": "619" }, "name": "chord_progressions_avro.avro", "description": "", @@ -2820,62 +2820,62 @@ }, "fields": [ { - "fieldPath": "[version=2.0].[type=Root].[type=double].Progression Quality", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FirstChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "double", + "nativeDataType": "FirstChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].1st chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FourthChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "FourthChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].2nd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].SecondChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "SecondChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].3rd chord", - "nullable": true, + "fieldPath": 
"[version=2.0].[type=Record].[type=long].ThirdChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "ThirdChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=string].4th chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=string].ProgressionQuality", + "nullable": false, "type": { "type": { "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "string", + "nativeDataType": "ProgressionQuality", "recursive": false, "isPartOfKey": false } @@ -2939,7 +2939,58 @@ "columnCount": 5, "fieldProfiles": [ { - "fieldPath": "1st chord", + "fieldPath": "FirstChord", + "uniqueCount": 5, + "uniqueProportion": 0.17857142857142858, + "nullCount": 0, + "nullProportion": 0.0, + "distinctValueFrequencies": [ + { + "value": "1", + "frequency": 19 + }, + { + "value": "2", + "frequency": 3 + }, + { + "value": "4", + "frequency": 2 + }, + { + "value": "5", + "frequency": 1 + }, + { + "value": "6", + "frequency": 3 + } + ], + "sampleValues": [ + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "2", + "4", + "5", + "6", + "6", + "6" + ] + }, + { + "fieldPath": "SecondChord", "uniqueCount": 5, "uniqueProportion": 0.17857142857142858, "nullCount": 0, @@ -2990,7 +3041,7 @@ ] }, { - "fieldPath": "2nd chord", + "fieldPath": "ThirdChord", "uniqueCount": 7, "uniqueProportion": 0.25, "nullCount": 0, @@ -3049,7 +3100,7 @@ ] }, { - "fieldPath": "3rd chord", + "fieldPath": "FourthChord", "uniqueCount": 6, "uniqueProportion": 0.21428571428571427, "nullCount": 0, @@ -3104,7 +3155,7 @@ ] }, { - "fieldPath": "4th chord", + "fieldPath": "ProgressionQuality", "uniqueCount": 20, "uniqueProportion": 0.7142857142857143, "nullCount": 0, @@ -3213,41 +3264,6 @@ "Sweet", "Wistful" ] - }, - { - "fieldPath": "Progression Quality", - "uniqueCount": 1, - "uniqueProportion": 0.03571428571428571, - 
"nullCount": 0, - "nullProportion": 0.0, - "distinctValueFrequencies": [ - { - "value": "NaN", - "frequency": 28 - } - ], - "sampleValues": [ - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan" - ] } ] } diff --git a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_spec_for_files.json b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_spec_for_files.json index 1bd75ae457cb4..ed2c992655a89 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_spec_for_files.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_spec_for_files.json @@ -9,7 +9,7 @@ "customProperties": { "schema_inferred_from": "tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro", "number_of_files": "1", - "size_in_bytes": "1024" + "size_in_bytes": "619" }, "name": "chord_progressions_avro.avro", "description": "", @@ -47,62 +47,62 @@ }, "fields": [ { - "fieldPath": "[version=2.0].[type=Root].[type=double].Progression Quality", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FirstChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "double", + "nativeDataType": "FirstChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].1st chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FourthChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "FourthChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].2nd chord", - "nullable": true, + "fieldPath": 
"[version=2.0].[type=Record].[type=long].SecondChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "SecondChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].3rd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].ThirdChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "ThirdChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=string].4th chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=string].ProgressionQuality", + "nullable": false, "type": { "type": { "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "string", + "nativeDataType": "ProgressionQuality", "recursive": false, "isPartOfKey": false } @@ -1046,7 +1046,58 @@ "columnCount": 5, "fieldProfiles": [ { - "fieldPath": "1st chord", + "fieldPath": "FirstChord", + "uniqueCount": 5, + "uniqueProportion": 0.17857142857142858, + "nullCount": 0, + "nullProportion": 0.0, + "distinctValueFrequencies": [ + { + "value": "1", + "frequency": 19 + }, + { + "value": "2", + "frequency": 3 + }, + { + "value": "4", + "frequency": 2 + }, + { + "value": "5", + "frequency": 1 + }, + { + "value": "6", + "frequency": 3 + } + ], + "sampleValues": [ + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "2", + "4", + "5", + "6", + "6", + "6" + ] + }, + { + "fieldPath": "SecondChord", "uniqueCount": 5, "uniqueProportion": 0.17857142857142858, "nullCount": 0, @@ -1097,7 +1148,7 @@ ] }, { - "fieldPath": "2nd chord", + "fieldPath": "ThirdChord", "uniqueCount": 7, "uniqueProportion": 0.25, "nullCount": 0, @@ -1156,7 +1207,7 @@ ] }, { - "fieldPath": "3rd chord", + "fieldPath": "FourthChord", "uniqueCount": 6, "uniqueProportion": 0.21428571428571427, 
"nullCount": 0, @@ -1211,7 +1262,7 @@ ] }, { - "fieldPath": "4th chord", + "fieldPath": "ProgressionQuality", "uniqueCount": 20, "uniqueProportion": 0.7142857142857143, "nullCount": 0, @@ -1320,41 +1371,6 @@ "Sweet", "Wistful" ] - }, - { - "fieldPath": "Progression Quality", - "uniqueCount": 1, - "uniqueProportion": 0.03571428571428571, - "nullCount": 0, - "nullProportion": 0.0, - "distinctValueFrequencies": [ - { - "value": "NaN", - "frequency": 28 - } - ], - "sampleValues": [ - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan" - ] } ] } diff --git a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_specs_of_different_buckets.json b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_specs_of_different_buckets.json index b9687b97571cb..f7793140fe033 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_specs_of_different_buckets.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_multiple_specs_of_different_buckets.json @@ -9,7 +9,7 @@ "customProperties": { "schema_inferred_from": "tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro", "number_of_files": "1", - "size_in_bytes": "1024" + "size_in_bytes": "619" }, "name": "chord_progressions_avro.avro", "description": "", @@ -47,62 +47,62 @@ }, "fields": [ { - "fieldPath": "[version=2.0].[type=Root].[type=double].Progression Quality", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FirstChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "double", + "nativeDataType": "FirstChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].1st chord", - "nullable": true, + "fieldPath": 
"[version=2.0].[type=Record].[type=long].FourthChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "FourthChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].2nd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].SecondChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "SecondChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].3rd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].ThirdChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "ThirdChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=string].4th chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=string].ProgressionQuality", + "nullable": false, "type": { "type": { "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "string", + "nativeDataType": "ProgressionQuality", "recursive": false, "isPartOfKey": false } @@ -1046,7 +1046,58 @@ "columnCount": 5, "fieldProfiles": [ { - "fieldPath": "1st chord", + "fieldPath": "FirstChord", + "uniqueCount": 5, + "uniqueProportion": 0.17857142857142858, + "nullCount": 0, + "nullProportion": 0.0, + "distinctValueFrequencies": [ + { + "value": "1", + "frequency": 19 + }, + { + "value": "2", + "frequency": 3 + }, + { + "value": "4", + "frequency": 2 + }, + { + "value": "5", + "frequency": 1 + }, + { + "value": "6", + "frequency": 3 + } + ], + "sampleValues": [ + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "2", + "4", + "5", + "6", + "6", + "6" + ] + }, + { + "fieldPath": "SecondChord", "uniqueCount": 5, 
"uniqueProportion": 0.17857142857142858, "nullCount": 0, @@ -1097,7 +1148,7 @@ ] }, { - "fieldPath": "2nd chord", + "fieldPath": "ThirdChord", "uniqueCount": 7, "uniqueProportion": 0.25, "nullCount": 0, @@ -1156,7 +1207,7 @@ ] }, { - "fieldPath": "3rd chord", + "fieldPath": "FourthChord", "uniqueCount": 6, "uniqueProportion": 0.21428571428571427, "nullCount": 0, @@ -1211,7 +1262,7 @@ ] }, { - "fieldPath": "4th chord", + "fieldPath": "ProgressionQuality", "uniqueCount": 20, "uniqueProportion": 0.7142857142857143, "nullCount": 0, @@ -1320,41 +1371,6 @@ "Sweet", "Wistful" ] - }, - { - "fieldPath": "Progression Quality", - "uniqueCount": 1, - "uniqueProportion": 0.03571428571428571, - "nullCount": 0, - "nullProportion": 0.0, - "distinctValueFrequencies": [ - { - "value": "NaN", - "frequency": 28 - } - ], - "sampleValues": [ - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan" - ] } ] } diff --git a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_single_file.json b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_single_file.json index a5a68777cad5c..f54c62865bcde 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_single_file.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/local/golden_mces_single_file.json @@ -9,7 +9,7 @@ "customProperties": { "schema_inferred_from": "tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro", "number_of_files": "1", - "size_in_bytes": "1024" + "size_in_bytes": "619" }, "name": "chord_progressions_avro.avro", "description": "", @@ -47,62 +47,62 @@ }, "fields": [ { - "fieldPath": "[version=2.0].[type=Root].[type=double].Progression Quality", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FirstChord", + "nullable": false, "type": { "type": { 
"com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "double", + "nativeDataType": "FirstChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].1st chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FourthChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "FourthChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].2nd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].SecondChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "SecondChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].3rd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].ThirdChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "ThirdChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=string].4th chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=string].ProgressionQuality", + "nullable": false, "type": { "type": { "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "string", + "nativeDataType": "ProgressionQuality", "recursive": false, "isPartOfKey": false } @@ -1046,7 +1046,58 @@ "columnCount": 5, "fieldProfiles": [ { - "fieldPath": "1st chord", + "fieldPath": "FirstChord", + "uniqueCount": 5, + "uniqueProportion": 0.17857142857142858, + "nullCount": 0, + "nullProportion": 0.0, + "distinctValueFrequencies": [ + { + "value": "1", + "frequency": 19 + }, + { + "value": "2", + "frequency": 3 + }, + { + "value": "4", + "frequency": 2 + }, + { + "value": "5", + "frequency": 1 + }, + { + "value": 
"6", + "frequency": 3 + } + ], + "sampleValues": [ + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "1", + "2", + "4", + "5", + "6", + "6", + "6" + ] + }, + { + "fieldPath": "SecondChord", "uniqueCount": 5, "uniqueProportion": 0.17857142857142858, "nullCount": 0, @@ -1097,7 +1148,7 @@ ] }, { - "fieldPath": "2nd chord", + "fieldPath": "ThirdChord", "uniqueCount": 7, "uniqueProportion": 0.25, "nullCount": 0, @@ -1156,7 +1207,7 @@ ] }, { - "fieldPath": "3rd chord", + "fieldPath": "FourthChord", "uniqueCount": 6, "uniqueProportion": 0.21428571428571427, "nullCount": 0, @@ -1211,7 +1262,7 @@ ] }, { - "fieldPath": "4th chord", + "fieldPath": "ProgressionQuality", "uniqueCount": 20, "uniqueProportion": 0.7142857142857143, "nullCount": 0, @@ -1320,41 +1371,6 @@ "Sweet", "Wistful" ] - }, - { - "fieldPath": "Progression Quality", - "uniqueCount": 1, - "uniqueProportion": 0.03571428571428571, - "nullCount": 0, - "nullProportion": 0.0, - "distinctValueFrequencies": [ - { - "value": "NaN", - "frequency": 28 - } - ], - "sampleValues": [ - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan", - "nan" - ] } ] } diff --git a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_files.json b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_files.json index 36d3ba1b3510d..58c225e1ec4c9 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_files.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_files.json @@ -949,7 +949,7 @@ "customProperties": { "schema_inferred_from": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro", "number_of_files": "1", - "size_in_bytes": "1024" + "size_in_bytes": "619" }, "name": "chord_progressions_avro.avro", "description": "", @@ -1003,62 +1003,62 
@@ }, "fields": [ { - "fieldPath": "[version=2.0].[type=Root].[type=double].Progression Quality", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FirstChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "double", + "nativeDataType": "FirstChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].1st chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FourthChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "FourthChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].2nd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].SecondChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "SecondChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].3rd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].ThirdChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "ThirdChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=string].4th chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=string].ProgressionQuality", + "nullable": false, "type": { "type": { "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "string", + "nativeDataType": "ProgressionQuality", "recursive": false, "isPartOfKey": false } diff --git a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_spec_for_files.json b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_spec_for_files.json 
index 84ace7d673676..9c41bbdc80c49 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_spec_for_files.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_spec_for_files.json @@ -9,7 +9,7 @@ "customProperties": { "schema_inferred_from": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro", "number_of_files": "1", - "size_in_bytes": "1024" + "size_in_bytes": "619" }, "name": "chord_progressions_avro.avro", "description": "", @@ -47,62 +47,62 @@ }, "fields": [ { - "fieldPath": "[version=2.0].[type=Root].[type=double].Progression Quality", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FirstChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "double", + "nativeDataType": "FirstChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].1st chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FourthChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "FourthChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].2nd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].SecondChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "SecondChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].3rd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].ThirdChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "ThirdChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": 
"[version=2.0].[type=Root].[type=string].4th chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=string].ProgressionQuality", + "nullable": false, "type": { "type": { "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "string", + "nativeDataType": "ProgressionQuality", "recursive": false, "isPartOfKey": false } diff --git a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_specs_of_different_buckets.json b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_specs_of_different_buckets.json index f7f3cb8fb743e..985140f774ab4 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_specs_of_different_buckets.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_multiple_specs_of_different_buckets.json @@ -9,7 +9,7 @@ "customProperties": { "schema_inferred_from": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro", "number_of_files": "1", - "size_in_bytes": "1024" + "size_in_bytes": "619" }, "name": "chord_progressions_avro.avro", "description": "", @@ -47,62 +47,62 @@ }, "fields": [ { - "fieldPath": "[version=2.0].[type=Root].[type=double].Progression Quality", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FirstChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "double", + "nativeDataType": "FirstChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].1st chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FourthChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "FourthChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].2nd chord", - "nullable": true, + "fieldPath": 
"[version=2.0].[type=Record].[type=long].SecondChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "SecondChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].3rd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].ThirdChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "ThirdChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=string].4th chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=string].ProgressionQuality", + "nullable": false, "type": { "type": { "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "string", + "nativeDataType": "ProgressionQuality", "recursive": false, "isPartOfKey": false } diff --git a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_single_file.json b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_single_file.json index 5353d95ada8f7..5d87d423a6a67 100644 --- a/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_single_file.json +++ b/metadata-ingestion/tests/integration/s3/golden-files/s3/golden_mces_single_file.json @@ -9,7 +9,7 @@ "customProperties": { "schema_inferred_from": "s3://my-test-bucket/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro", "number_of_files": "1", - "size_in_bytes": "1024" + "size_in_bytes": "619" }, "name": "chord_progressions_avro.avro", "description": "", @@ -47,62 +47,62 @@ }, "fields": [ { - "fieldPath": "[version=2.0].[type=Root].[type=double].Progression Quality", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FirstChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "double", + "nativeDataType": "FirstChord", 
"recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].1st chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].FourthChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "FourthChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].2nd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].SecondChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "SecondChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=long].3rd chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=long].ThirdChord", + "nullable": false, "type": { "type": { "com.linkedin.schema.NumberType": {} } }, - "nativeDataType": "long", + "nativeDataType": "ThirdChord", "recursive": false, "isPartOfKey": false }, { - "fieldPath": "[version=2.0].[type=Root].[type=string].4th chord", - "nullable": true, + "fieldPath": "[version=2.0].[type=Record].[type=string].ProgressionQuality", + "nullable": false, "type": { "type": { "com.linkedin.schema.StringType": {} } }, - "nativeDataType": "string", + "nativeDataType": "ProgressionQuality", "recursive": false, "isPartOfKey": false } diff --git a/metadata-ingestion/tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro b/metadata-ingestion/tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro index 8a6d9df66bb79..79c329b3f8dca 100644 Binary files a/metadata-ingestion/tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro and 
b/metadata-ingestion/tests/integration/s3/test_data/local_system/folder_a/folder_aa/folder_aaa/chord_progressions_avro.avro differ diff --git a/metadata-ingestion/tests/integration/s3/test_s3.py b/metadata-ingestion/tests/integration/s3/test_s3.py index 98ae2eaa393ab..462ca88b7c123 100644 --- a/metadata-ingestion/tests/integration/s3/test_s3.py +++ b/metadata-ingestion/tests/integration/s3/test_s3.py @@ -140,7 +140,7 @@ def test_data_lake_s3_ingest( def test_data_lake_local_ingest( pytestconfig, touch_local_files, source_file, tmp_path, mock_time ): - os.environ["SPARK_VERSION"] = "3.0.3" + os.environ["SPARK_VERSION"] = "3.3.2" test_resources_dir = pytestconfig.rootpath / "tests/integration/s3/" f = open(os.path.join(SOURCE_FILES_PATH, source_file)) source = json.load(f)