diff --git a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/csv/patient1.csv b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/csv/patient1.csv new file mode 100644 index 00000000..2bf60499 --- /dev/null +++ b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/csv/patient1.csv @@ -0,0 +1,5 @@ +! This line is a comment to test spark options +pid,gender,birthDate +p1,male,2000-05-10 +p2,male,1985-05-08 +p3,male,1997-02 \ No newline at end of file diff --git a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/csv/patient2.csv b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/csv/patient2.csv new file mode 100644 index 00000000..cf0b2bda --- /dev/null +++ b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/csv/patient2.csv @@ -0,0 +1,5 @@ +! This line is a comment to test spark options +pid,gender,birthDate +p4,male,1999-06-05 +p5,male,1965-10-01 +p6,female,1991-03 \ No newline at end of file diff --git a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/csv/patient3.csv b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/csv/patient3.csv new file mode 100644 index 00000000..bde9dd4c --- /dev/null +++ b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/csv/patient3.csv @@ -0,0 +1,5 @@ +! 
This line is a comment to test spark options +pid,gender,birthDate +p7,female,1972-10-25 +p8,female,2010-01-10 +p9,female,1999-05-12 \ No newline at end of file diff --git a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/json/patient1.json b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/json/patient1.json new file mode 100644 index 00000000..28e95146 --- /dev/null +++ b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/json/patient1.json @@ -0,0 +1,3 @@ +{"pid": "p1", "gender": "male", "birthDate": "2000-05-10"}, +{"pid": "p2", "gender": "male", "birthDate": "1985-05-08"}, +{"pid": "p3", "gender": "male", "birthDate": "1997-02"} \ No newline at end of file diff --git a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/json/patient2.json b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/json/patient2.json new file mode 100644 index 00000000..e9aeb610 --- /dev/null +++ b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/json/patient2.json @@ -0,0 +1,3 @@ +{"pid": "p4", "gender": "male", "birthDate": "1999-06-05"}, +{"pid": "p5", "gender": "male", "birthDate": "1965-10-01"}, +{"pid": "p6", "gender": "female", "birthDate": "1991-03"} \ No newline at end of file diff --git a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/json/patient3.json b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/json/patient3.json new file mode 100644 index 00000000..6b5b9e9a --- /dev/null +++ b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/json/patient3.json @@ -0,0 +1,3 @@ +{"pid": "p7", "gender": "female", "birthDate": "1972-10-25"}, +{"pid": "p8", "gender": "female", "birthDate": "2010-01-10"}, +{"pid": "p9", "gender": "female", "birthDate": "1999-05-12"} \ No newline at end of file diff --git 
a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/parquet/patients1.parquet b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/parquet/patients1.parquet new file mode 100644 index 00000000..502c5647 Binary files /dev/null and b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/parquet/patients1.parquet differ diff --git a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/parquet/patients2.parquet b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/parquet/patients2.parquet new file mode 100644 index 00000000..154a62b8 Binary files /dev/null and b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/parquet/patients2.parquet differ diff --git a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/parquet/patients3.parquet b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/parquet/patients3.parquet new file mode 100644 index 00000000..44f7f533 Binary files /dev/null and b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/parquet/patients3.parquet differ diff --git a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/txt-csv/patient1.txt b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/txt-csv/patient1.txt new file mode 100644 index 00000000..5cf9bd66 --- /dev/null +++ b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/txt-csv/patient1.txt @@ -0,0 +1,4 @@ +pid,gender,birthDate +p1,male,2000-05-10 +p2,male,1985-05-08 +p3,male,1997-02 \ No newline at end of file diff --git a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/txt-csv/patient2.txt b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/txt-csv/patient2.txt new file mode 100644 index 00000000..5a9ac92e --- 
/dev/null +++ b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/txt-csv/patient2.txt @@ -0,0 +1,4 @@ +pid,gender,birthDate +p4,male,1999-06-05 +p5,male,1965-10-01 +p6,female,1991-03 \ No newline at end of file diff --git a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/txt-csv/patient3.txt b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/txt-csv/patient3.txt new file mode 100644 index 00000000..7b84ae4a --- /dev/null +++ b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/txt-csv/patient3.txt @@ -0,0 +1,4 @@ +pid,gender,birthDate +p7,female,1972-10-25 +p8,female,2010-01-10 +p9,female,1999-05-12 \ No newline at end of file diff --git a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/txt-ndjson/patient1.txt b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/txt-ndjson/patient1.txt new file mode 100644 index 00000000..735f6d42 --- /dev/null +++ b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/txt-ndjson/patient1.txt @@ -0,0 +1,3 @@ +{"pid": "p1", "gender": "male", "birthDate": "2000-05-10"} +{"pid": "p2", "gender": "male", "birthDate": "1985-05-08"} +{"pid": "p3", "gender": "male", "birthDate": "1997-02"} \ No newline at end of file diff --git a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/txt-ndjson/patient2.txt b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/txt-ndjson/patient2.txt new file mode 100644 index 00000000..ef95e890 --- /dev/null +++ b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/txt-ndjson/patient2.txt @@ -0,0 +1,3 @@ +{"pid": "p4", "gender": "male", "birthDate": "1999-06-05"} +{"pid": "p5", "gender": "male", "birthDate": "1965-10-01"} +{"pid": "p6", "gender": "female", "birthDate": "1991-03"} \ No newline at end of file diff --git 
a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/txt-ndjson/patient3.txt b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/txt-ndjson/patient3.txt new file mode 100644 index 00000000..258e6200 --- /dev/null +++ b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/txt-ndjson/patient3.txt @@ -0,0 +1,3 @@ +{"pid": "p7", "gender": "female", "birthDate": "1972-10-25"} +{"pid": "p8", "gender": "female", "birthDate": "2010-01-10"} +{"pid": "p9", "gender": "female", "birthDate": "1999-05-12"} \ No newline at end of file diff --git a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients-ndjson.txt b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients-ndjson.txt new file mode 100644 index 00000000..1a650ded --- /dev/null +++ b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients-ndjson.txt @@ -0,0 +1,11 @@ +{ "pid": "p1", "gender": "male", "birthDate": "2000-05-10", "deceasedDateTime": null, "homePostalCode": null } +{ "pid": "p2", "gender": "male", "birthDate": "1985-05-08", "deceasedDateTime": "2017-03-10", "homePostalCode": "G02547" } +{ "pid": "p3", "gender": "male", "birthDate": "1997-02", "deceasedDateTime": null, "homePostalCode": null } +{ "pid": "p4", "gender": "male", "birthDate": "1999-06-05", "deceasedDateTime": null, "homePostalCode": "H10564" } +{ "pid": "p5", "gender": "male", "birthDate": "1965-10-01", "deceasedDateTime": "2019-04-21", "homePostalCode": "G02547" } +{ "pid": "p6", "gender": "female", "birthDate": "1991-03", "deceasedDateTime": null, "homePostalCode": null } +/* This line is a comment to test spark options */ +{ "pid": "p7", "gender": "female", "birthDate": "1972-10-25", "deceasedDateTime": null, "homePostalCode": "V13135" } +{ "pid": "p8", "gender": "female", "birthDate": "2010-01-10", "deceasedDateTime": null, "homePostalCode": 
"Z54564" } +{ "pid": "p9", "gender": "female", "birthDate": "1999-05-12", "deceasedDateTime": null, "homePostalCode": null } +{ "pid": "p10", "gender": "female", "birthDate": "2003-11", "deceasedDateTime": null, "homePostalCode": null } diff --git a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients.csv b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients.csv new file mode 100644 index 00000000..dc6b243e --- /dev/null +++ b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients.csv @@ -0,0 +1,12 @@ +! This file has trailing spaces for testing purposes +pid,gender,birthDate,deceasedDateTime ,homePostalCode +p1 ,male,2000-05-10,, +p2,male,1985-05-08,2017-03-10,G02547 +p3,male,1997-02,, +p4,male,1999-06-05,,H10564 +p5,male,1965-10-01,2019-04-21,G02547 +p6,female ,1991-03,, +p7 ,female,1972-10-25,,V13135 +p8,female,2010-01-10,,Z54564 +p9,female,1999-05-12,, +p10,female,2003-11 ,, diff --git a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients.json b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients.json new file mode 100644 index 00000000..b3741068 --- /dev/null +++ b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients.json @@ -0,0 +1,11 @@ +{ "pid": "p1", "gender": "male", "birthDate": "2000-05-10", "deceasedDateTime": null, "homePostalCode": null }, +{ "pid": "p2", "gender": "male", "birthDate": "1985-05-08", "deceasedDateTime": "2017-03-10", "homePostalCode": "G02547" }, +{ "pid": "p3", "gender": "male", "birthDate": "1997-02", "deceasedDateTime": null, "homePostalCode": null }, +{ "pid": "p4", "gender": "male", "birthDate": "1999-06-05", "deceasedDateTime": null, "homePostalCode": "H10564" }, +{ "pid": "p5", "gender": "male", "birthDate": "1965-10-01", "deceasedDateTime": "2019-04-21", "homePostalCode": "G02547" }, +{ 
"pid": "p6", "gender": "female", "birthDate": "1991-03", "deceasedDateTime": null, "homePostalCode": null }, +/* This line is a comment to test spark options */ +{ "pid": "p7", "gender": "female", "birthDate": "1972-10-25", "deceasedDateTime": null, "homePostalCode": "V13135" }, +{ "pid": "p8", "gender": "female", "birthDate": "2010-01-10", "deceasedDateTime": null, "homePostalCode": "Z54564" }, +{ "pid": "p9", "gender": "female", "birthDate": "1999-05-12", "deceasedDateTime": null, "homePostalCode": null }, +{ "pid": "p10", "gender": "female", "birthDate": "2003-11", "deceasedDateTime": null, "homePostalCode": null } \ No newline at end of file diff --git a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients.parquet b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients.parquet new file mode 100644 index 00000000..d2b5a882 Binary files /dev/null and b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients.parquet differ diff --git a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients.tsv b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients.tsv new file mode 100644 index 00000000..987ba87d --- /dev/null +++ b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients.tsv @@ -0,0 +1,11 @@ +pid gender birthDate deceasedDateTime homePostalCode +p1 male 2000-05-10 +p2 male 1985-05-08 2017-03-10 G02547 +p3 male 1997-02 +p4 male 1999-06-05 H10564 +p5 male 1965-10-01 2019-04-21 G02547 +p6 female 1991-03 +p7 female 1972-10-25 V13135 +p8 female 2010-01-10 Z54564 +p9 female 1999-05-12 +p10 female 2003-11 diff --git a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients.txt b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients.txt new file mode 100644 index 
00000000..dc6b243e --- /dev/null +++ b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients.txt @@ -0,0 +1,12 @@ +! This file has trailing spaces for testing purposes +pid,gender,birthDate,deceasedDateTime ,homePostalCode +p1 ,male,2000-05-10,, +p2,male,1985-05-08,2017-03-10,G02547 +p3,male,1997-02,, +p4,male,1999-06-05,,H10564 +p5,male,1965-10-01,2019-04-21,G02547 +p6,female ,1991-03,, +p7 ,female,1972-10-25,,V13135 +p8,female,2010-01-10,,Z54564 +p9,female,1999-05-12,, +p10,female,2003-11 ,, diff --git a/tofhir-engine/src/test/scala/io/tofhir/test/engine/data/read/FileDataSourceReaderTest.scala b/tofhir-engine/src/test/scala/io/tofhir/test/engine/data/read/FileDataSourceReaderTest.scala new file mode 100644 index 00000000..a6e2be9f --- /dev/null +++ b/tofhir-engine/src/test/scala/io/tofhir/test/engine/data/read/FileDataSourceReaderTest.scala @@ -0,0 +1,471 @@ +package io.tofhir.test.engine.data.read + +import io.tofhir.engine.config.ToFhirConfig +import io.tofhir.engine.data.read.FileDataSourceReader +import io.tofhir.engine.model.{FileSystemSource, FileSystemSourceSettings, SourceFileFormats} +import org.apache.spark.sql.{DataFrame, Row, SparkSession} +import org.scalatest.BeforeAndAfterAll +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.must.Matchers.{contain, include} +import org.scalatest.matchers.should.Matchers.convertToAnyShouldWrapper + +import java.nio.file.Paths +import java.sql.Timestamp +import java.text.SimpleDateFormat +import scala.language.postfixOps + +/** + * Unit tests for the FileDataSourceReader class. + * Tests the functionality of reading files in different formats. + */ +class FileDataSourceReaderTest extends AnyFlatSpec with BeforeAndAfterAll { + + /** + * SparkSession used for the test cases. + */ + val sparkSession: SparkSession = ToFhirConfig.sparkSession + /** + * Path to the directory containing test data for FileDataSourceReader. 
+ */ + val testDataFolderPath: String = Paths.get(getClass.getResource("/file-data-source-reader-test-data").toURI).toAbsolutePath.toString + /** + * Instance of FileDataSourceReader used to read data files during tests. + */ + val fileDataSourceReader = new FileDataSourceReader(sparkSession) + /** + * Date format used for parsing and formatting date values in test cases. + */ + val dateFormat: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd") + + override def beforeAll(): Unit = { + super.beforeAll() + } + + override def afterAll(): Unit = { + super.afterAll() + } + + /** + * Tests that the FileDataSourceReader correctly handles invalid input by throwing the appropriate exceptions. + * + * The test case covers two scenarios: + * 1. Providing a file path instead of a directory for a streaming job should result in an IllegalArgumentException. + * 2. Providing unsupported file formats or extensions should result in a NotImplementedError. + * + * The following configurations are used for the tests: + * - `illegalArgumentSourceBinding`: A source binding with a 'file' path to test the directory requirement for streaming jobs. + * - `unsupportedFileFormatSourceBinding`: A source binding with an unsupported file format to test the unsupported format handling. + * - `unsupportedExtensionSourceBinding`: A source binding with an unsupported file extension to test the unsupported extension handling. + * - `streamJobSourceSettings`: Mapping job source settings configured for a streaming job. + * - `batchJobSourceSettings`: Mapping job source settings configured for a batch job. + * + * The test verifies the following: + * 1. An IllegalArgumentException is thrown with the expected message when a file path is provided instead of a directory for a streaming job. + * 2. A NotImplementedError is thrown for unsupported file formats and extensions, indicating that these cases are not yet implemented or handled. 
+ * + */ + it should "throw IllegalArgumentException, NotImplementedError when necessary" in { + // Folder including the test files belong to this test + val folderPath = "/single-file-test" + + // Test case 1: Verify that providing a file path instead of a directory throws an IllegalArgumentException + val fileName: String = "patients.csv" + val illegalArgumentSourceBinding = FileSystemSource(path = fileName) + val streamJobSourceSettings = FileSystemSourceSettings(name = "FileDataSourceReaderTest0", sourceUri = "test-uri", dataFolderPath = testDataFolderPath.concat(folderPath), asStream = true) + val exception = intercept[IllegalArgumentException] { + fileDataSourceReader.read(illegalArgumentSourceBinding, streamJobSourceSettings, Option.empty) + } + exception.getMessage should include(s"${fileName} is not a directory. For streaming job, you should provide a directory.") + + // Test case 2: Verify that unsupported file formats and extensions throw a NotImplementedError + val unsupportedFileFormatSourceBinding = FileSystemSource(path = fileName, fileFormat = Some("UNSUPPORTED")) + val unsupportedExtensionSourceBinding = FileSystemSource(path = "patients.UNSUPPORTED") + val batchJobSourceSettings = streamJobSourceSettings.copy(asStream = false) + assertThrows[NotImplementedError] { + fileDataSourceReader.read(unsupportedFileFormatSourceBinding, batchJobSourceSettings, Option.empty) + } + assertThrows[NotImplementedError] { + fileDataSourceReader.read(unsupportedExtensionSourceBinding, batchJobSourceSettings, Option.empty) + } + } + + /** + * Tests that the FileDataSourceReader correctly reads data from CSV, TSV, and TXT_CSV files. + * + * This test verifies that the reader can handle different file formats and produce the expected results. + * The test covers the following formats: + * 1. CSV + * 2. TSV + * 3. 
TXT_CSV (Text file with CSV-like format) + * + * The test uses the following source binding configurations: + * FileSystemSource(path = "patients.csv", fileFormat = None, options = Map("ignoreTrailingWhiteSpace" -> "true", "comment" -> "!")) + * FileSystemSource(path = "patients.tsv", fileFormat = None, options = Map("ignoreTrailingWhiteSpace" -> "true", "comment" -> "!")) + * FileSystemSource(path = "patients.txt", fileFormat = "txt-csv", options = Map("ignoreTrailingWhiteSpace" -> "true", "comment" -> "!")) + * + * The expected read result for all file formats: + * +---+------+-------------------+----------------+--------------+ + * |pid|gender| birthDate|deceasedDateTime|homePostalCode| + * +---+------+-------------------+----------------+--------------+ + * | p1| male|2000-05-10 00:00:00| NULL| NULL| + * | p2| male|1985-05-08 00:00:00| 2017-03-10| G02547| + * | p3| male|1997-02-01 00:00:00| NULL| NULL| + * | p4| male|1999-06-05 00:00:00| NULL| H10564| + * | p5| male|1965-10-01 00:00:00| 2019-04-21| G02547| + * | p6|female|1991-03-01 00:00:00| NULL| NULL| + * | p7|female|1972-10-25 00:00:00| NULL| V13135| + * | p8|female|2010-01-10 00:00:00| NULL| Z54564| + * | p9|female|1999-05-12 00:00:00| NULL| NULL| + * |p10|female|2003-11-01 00:00:00| NULL| NULL| + * +---+------+-------------------+----------------+--------------+ + * + */ + it should "correctly read from CSV, TSV, and TXT_CSV files" in { + // Folder containing the test files for this test + val folderPath = "/single-file-test" + + // Expected values for validation + val expectedRowNumber = 10 + val expectedColumns = Array("pid", "gender", "birthDate", "deceasedDateTime", "homePostalCode") + val expectedFirstRow = Row("p1", "male", new Timestamp(dateFormat.parse("2000-05-10").getTime), null, null) + val expectedLastRow = Row("p10", "female", new Timestamp(dateFormat.parse("2003-11-01").getTime), null, null) + + // A sequence of file names and their corresponding formats to be tested + val 
sourceBindingConfigurations = Seq( + ("patients.csv", None), + ("patients.tsv", None), + ("patients.txt", Some(SourceFileFormats.TXT_CSV)) + ) + + // Spark options to test if options are working + val sparkOptions = Map( + "ignoreTrailingWhiteSpace" -> "true", + "comment" -> "!" + ) + + // Loop through each source binding configuration to run the test + val mappingJobSourceSettings = FileSystemSourceSettings(name = "FileDataSourceReaderTest1", sourceUri = "test-uri", dataFolderPath = testDataFolderPath.concat(folderPath)) + sourceBindingConfigurations.foreach { case (fileName, fileFormat) => + // Read the data using the reader and the defined settings + val mappingSourceBinding = FileSystemSource(path = fileName, fileFormat = fileFormat, options = sparkOptions) + val result: DataFrame = fileDataSourceReader.read(mappingSourceBinding, mappingJobSourceSettings, Option.empty) + + // Validate the result + result.count() shouldBe expectedRowNumber + result.columns shouldBe expectedColumns + result.first() shouldBe expectedFirstRow + result.collect().last shouldBe expectedLastRow + } + } + + /** + * Tests that the FileDataSourceReader correctly reads multiple files from CSV and TXT_CSV folders. + * + * This test verifies that the reader can handle multiple files across different file formats + * and produce the expected results. The test covers reading from folders containing: + * 1. CSV files + * 2. 
TXT_CSV (Text files with CSV-like format) + * + * The test uses the following source binding configurations: + * FileSystemSource(path = "csv", fileFormat = Some(SourceFileFormats.CSV), options = Map("ignoreTrailingWhiteSpace" -> "true", "comment" -> "!")) + * FileSystemSource(path = "txt-csv", fileFormat = Some(SourceFileFormats.TXT_CSV), options = Map("ignoreTrailingWhiteSpace" -> "true", "comment" -> "!")) + * + * The expected read result for both folder formats: + * +---+------+-------------------+ + * |pid|gender| birthDate| + * +---+------+-------------------+ + * | p1| male|2000-05-10 00:00:00| + * | p2| male|1985-05-08 00:00:00| + * | p3| male|1997-02-01 00:00:00| + * | p4| male|1999-06-05 00:00:00| + * | p5| male|1965-10-01 00:00:00| + * | p6|female|1991-03-01 00:00:00| + * | p7|female|1972-10-25 00:00:00| + * | p8|female|2010-01-10 00:00:00| + * | p9|female|1999-05-12 00:00:00| + * +---+------+-------------------+ + * (Rows may appear in different groupings, with each file contributing a distinct set of 3 rows.) + * + */ + it should "correctly read multiple files from CSV, TXT_CSV folders" in { + // Folder including the test folders belong to this test + val folderPath = "/folder-test" + + // Expected values for validation + val expectedRowNumber = 9 + val expectedColumns = Array("pid", "gender", "birthDate") + val expectedRows = Set( // One row from each file + Row("p1", "male", new Timestamp(dateFormat.parse("2000-05-10").getTime)), + Row("p4", "male", new Timestamp(dateFormat.parse("1999-06-05").getTime)), + Row("p7", "female", new Timestamp(dateFormat.parse("1972-10-25").getTime)) + ) + + // A sequence of folder names and file format of the files to be selected + val sourceBindingConfigurations = Seq( + ("csv", Some(SourceFileFormats.CSV)), + ("txt-csv", Some(SourceFileFormats.TXT_CSV)) + ) + // Spark options to test if options are working + val sparkOptions = Map( + "ignoreTrailingWhiteSpace" -> "true", + "comment" -> "!" 
+ ) + + // Loop through each source binding configuration to run the test + val mappingJobSourceSettings = FileSystemSourceSettings(name = "FileDataSourceReaderTest2", sourceUri = "test-uri", dataFolderPath = testDataFolderPath.concat(folderPath)) + sourceBindingConfigurations.foreach { case (folderName, fileFormat) => + // Read the data using the reader and the defined settings + val mappingSourceBinding = FileSystemSource(path = folderName, fileFormat = fileFormat, options = sparkOptions) + val result: DataFrame = fileDataSourceReader.read(mappingSourceBinding, mappingJobSourceSettings, Option.empty) + + // Validate the result + result.count() shouldBe expectedRowNumber + result.columns shouldBe expectedColumns + result.collect().toSet should contain allElementsOf expectedRows + } + } + + /** + * Tests that the FileDataSourceReader correctly reads data from JSON and TXT_NDJSON files. + * + * This test verifies that the reader can handle different file formats and produce the expected results. + * The test covers the following formats: + * 1. JSON + * 2. 
TXT_NDJSON (Text file with newline-delimited JSON format) + * + * The test uses the following source binding configurations: + * FileSystemSource(path = "patients.json", fileFormat = None, options = Map("allowComments" -> "true")) + * FileSystemSource(path = "patients-ndjson.txt", fileFormat = Some(SourceFileFormats.TXT_NDJSON), options = Map("allowComments" -> "true")) + * + * The expected read result is for both file formats: + * +------------+----------------+------+--------------+---+ + * | birthDate |deceasedDateTime|gender|homePostalCode|pid| + * +------------+----------------+------+--------------+---+ + * |2000-05-10 | NULL| male| NULL| p1| + * |1985-05-08 | 2017-03-10| male| G02547| p2| + * |1997-02 | NULL| male| NULL| p3| + * |1999-06-05 | NULL| male| H10564| p4| + * |1965-10-01 | 2019-04-21| male| G02547| p5| + * |1991-03 | NULL|female| NULL| p6| + * |1972-10-25 | NULL|female| V13135| p7| + * |2010-01-10 | NULL|female| Z54564| p8| + * |1999-05-12 | NULL|female| NULL| p9| + * |2003-11 | NULL|female| NULL|p10| + * +------------+----------------+------+--------------+---+ + * + */ + it should "correctly read from JSON and TXT-NDJSON files" in { + // Folder including the test files + val folderPath = "/single-file-test" + + // Define the expected values for validation (Note: Spark reads json columns in alphabetic order) + val expectedRowNumber = 10 + val expectedColumns = Array("birthDate", "deceasedDateTime", "gender", "homePostalCode", "pid") + val expectedFirstRow = Row("2000-05-10", null, "male", null, "p1") + val expectedLastRow = Row("2003-11", null, "female", null, "p10") + + // Define the file names and their corresponding formats to be tested + val sourceBindingConfigurations = Seq( + ("patients.json", None), + ("patients-ndjson.txt", Some(SourceFileFormats.TXT_NDJSON)) + ) + // Spark options to test if options are working + val sparkOptions = Map( + "allowComments" -> "true", + ) + + // Loop through each source binding configuration to run the test 
+ val mappingJobSourceSettings = FileSystemSourceSettings(name = s"FileDataSourceReaderTest3", sourceUri = "test-uri", dataFolderPath = testDataFolderPath.concat(folderPath)) + sourceBindingConfigurations.foreach { case (fileName, fileFormat) => + // Define the source binding and settings for reading the file + val mappingSourceBinding = FileSystemSource(path = fileName, fileFormat = fileFormat, options = sparkOptions) + // Read the data from the specified file + val result: DataFrame = fileDataSourceReader.read(mappingSourceBinding, mappingJobSourceSettings, Option.empty) + + // Validate the result + result.count() shouldBe expectedRowNumber + result.columns shouldBe expectedColumns + result.first() shouldBe expectedFirstRow + result.collect().last shouldBe expectedLastRow + } + } + + /** + * Tests that the FileDataSourceReader correctly reads multiple files from JSON and NDJSON folders. + * + * This test verifies that the reader can handle multiple files across different file formats + * and produce the expected results. The test covers reading from folders containing: + * 1. JSON (standard JSON files in the "json" folder) + * 2. TXT_NDJSON (newline-delimited JSON files in the "txt-ndjson" folder) + * + * The test uses the following source binding configurations: + * FileSystemSource(path = "json", fileFormat = Some(SourceFileFormats.JSON)) + * FileSystemSource(path = "txt-ndjson", fileFormat = Some(SourceFileFormats.TXT_NDJSON)) + * + * The expected read result for both formats: + * +----------+------+---+ + * | birthDate|gender|pid| + * +----------+------+---+ + * |2000-05-10| male| p1| + * |1985-05-08| male| p2| + * | 1997-02| male| p3| + * |1999-06-05| male| p4| + * |1965-10-01| male| p5| + * | 1991-03|female| p6| + * |1972-10-25|female| p7| + * |2010-01-10|female| p8| + * |1999-05-12|female| p9| + * +----------+------+---+ + * (Rows may appear in different groupings, with each file contributing a distinct set of 3 rows.) 
+ */ + it should "correctly read multiple files from JSON and NDJSON folders" in { + // Folder containing the test folders for JSON and NDJSON files + val folderPath = "/folder-test" + + // Expected values for validation + val expectedRowNumber = 9 + val expectedColumns = Array("birthDate", "gender", "pid") + // Expected rows for validation, one row from each file + val expectedRows = Set( + Row("2000-05-10", "male", "p1"), + Row("1999-06-05", "male", "p4"), + Row("1972-10-25", "female", "p7") + ) + + // A sequence of folder names and file format of the files to be selected + val sourceBindingConfigurations = Seq( + ("json", Some(SourceFileFormats.JSON)), + ("txt-ndjson", Some(SourceFileFormats.TXT_NDJSON)) + ) + + // Loop through each source binding configuration to run the test + val mappingJobSourceSettings = FileSystemSourceSettings(name = "FileDataSourceReaderTest4", sourceUri = "test-uri", dataFolderPath = testDataFolderPath.concat(folderPath)) + sourceBindingConfigurations.foreach { case (folderName, fileFormat) => + // Read the data using the reader and the defined settings + val mappingSourceBinding = FileSystemSource(path = folderName, fileFormat = fileFormat) + val result: DataFrame = fileDataSourceReader.read(mappingSourceBinding, mappingJobSourceSettings, Option.empty) + + // Validate the result + result.count() shouldBe expectedRowNumber + result.columns shouldBe expectedColumns + result.collect().toSet should contain allElementsOf expectedRows + } + } + + /** + * Tests that the FileDataSourceReader correctly reads data from a Parquet file. + * + * This test verifies that the reader can handle Parquet file format and produce the expected results. + * The test covers the following format: + * 1. 
PARQUET
   *
   * The test uses the following source binding configuration:
   *    - FileSystemSource(path = "patients.parquet", fileFormat = Some(SourceFileFormats.PARQUET))
   *
   * The expected read result for the Parquet file is:
   * +---+------+----------+-------------------+--------------+
   * |pid|gender| birthDate|deceasedDateTime   |homePostalCode|
   * +---+------+----------+-------------------+--------------+
   * | p1|  male|2000-05-10|               null|          null|
   * | p2|  male|1985-05-08|2017-03-10         |        G02547|
   * | p3|  male|1997-02-01|               null|          null|
   * | p4|  male|1999-06-05|               null|        H10564|
   * | p5|  male|1965-10-01|2019-04-21         |        G02547|
   * | p6|female|1991-03-01|               null|          null|
   * | p7|female|1972-10-25|               null|        V13135|
   * | p8|female|2010-01-10|               null|        Z54564|
   * | p9|female|1999-05-12|               null|          null|
   * |p10|female|2003-11   |               null|          null|
   * +---+------+----------+-------------------+--------------+
   *
   */
  it should "correctly read from Parquet file" in {
    // Sub-folder (relative to the test data folder) that holds the single-file fixtures
    val singleFileFolder = "/single-file-test"

    // Values the resulting DataFrame is checked against
    val columnNames = Array("pid", "gender", "birthDate", "deceasedDateTime", "homePostalCode")
    val rowCount = 10
    val headRow = Row("p1", "male", "2000-05-10", null, null)
    val tailRow = Row("p10", "female", "2003-11", null, null)

    // Source settings shared by every binding exercised below
    val sourceSettings = FileSystemSourceSettings(
      name = "FileDataSourceReaderTest5",
      sourceUri = "test-uri",
      dataFolderPath = testDataFolderPath.concat(singleFileFolder)
    )

    // (file name, file format) pairs to read; this test only covers the Parquet file
    val bindings = Seq("patients.parquet" -> Some(SourceFileFormats.PARQUET))

    for ((fileName, fileFormat) <- bindings) {
      // Build the binding for this file and read it through the reader under test
      val binding = FileSystemSource(path = fileName, fileFormat = fileFormat)
      val result: DataFrame = fileDataSourceReader.read(binding, sourceSettings, Option.empty)

      // Size, schema, then the first and last rows of the read result
      result.count() shouldBe rowCount
      result.columns shouldBe columnNames
      result.first() shouldBe headRow
      result.collect().last shouldBe tailRow
    }
  }

  /**
   * Tests that the FileDataSourceReader correctly reads data from a folder of Parquet files.
   *
   * This test verifies that the reader can handle the Parquet file format and produce the expected results.
   * The test covers the following format:
   *    1. PARQUET
   *
   * The test uses the following source binding configuration:
   *    - FileSystemSource(path = "parquet", fileFormat = Some(SourceFileFormats.PARQUET))
   *
   * The expected read result for the Parquet files is:
   * +---+------+----------+
   * |pid|gender| birthDate|
   * +---+------+----------+
   * | p1|  male|2000-05-10|
   * | p2|  male|1985-05-08|
   * | p3|  male|1997-02-01|
   * | p4|  male|1999-06-05|
   * | p5|  male|1965-10-01|
   * | p6|female|1991-03-01|
   * | p7|female|1972-10-25|
   * | p8|female|2010-01-10|
   * | p9|female|1999-05-12|
   * +---+------+----------+
   * (Rows may appear in a different order, grouped by each file.)
*/
  it should "correctly read multiple files from Parquet folders" in {
    // Sub-folder (relative to the test data folder) that holds the per-format fixture folders
    val multiFileFolder = "/folder-test"

    // One representative row per Parquet file; the order of rows across files is not asserted
    val sampleRows = Set(
      Row("p1", "male", "2000-05-10"),
      Row("p4", "male", "1999-06-05"),
      Row("p7", "female", "1972-10-25")
    )
    val columnNames = Array("pid", "gender", "birthDate")
    val rowCount = 9

    // Source settings shared by every binding exercised below
    val sourceSettings = FileSystemSourceSettings(
      name = "FileDataSourceReaderTest6",
      sourceUri = "test-uri",
      dataFolderPath = testDataFolderPath.concat(multiFileFolder)
    )

    // (folder name, file format) pairs to read; this test only covers the Parquet folder
    val bindings = Seq("parquet" -> Some(SourceFileFormats.PARQUET))

    for ((folderName, fileFormat) <- bindings) {
      // Build the binding for this folder and read it through the reader under test
      val binding = FileSystemSource(path = folderName, fileFormat = fileFormat)
      val result: DataFrame = fileDataSourceReader.read(binding, sourceSettings, Option.empty)

      // Size, schema, then membership of the sample rows regardless of row order
      result.count() shouldBe rowCount
      result.columns shouldBe columnNames
      result.collect().toSet should contain allElementsOf sampleRows
    }
  }
}