From 77c543183d8c4908fc3802b0e7ff74854972ceb9 Mon Sep 17 00:00:00 2001
From: emrecam
Date: Tue, 10 Sep 2024 15:24:41 +0300
Subject: [PATCH] :wrench: Test(FileDataSourceReader): Add tests for FileDataSourceReader class

---
 .../folder-test/csv/patient1.csv          |   5 +
 .../folder-test/csv/patient2.csv          |   5 +
 .../folder-test/csv/patient3.csv          |   5 +
 .../folder-test/json/patient1.json        |   3 +
 .../folder-test/json/patient2.json        |   3 +
 .../folder-test/json/patient3.json        |   3 +
 .../folder-test/parquet/patients1.parquet | Bin 0 -> 1091 bytes
 .../folder-test/parquet/patients2.parquet | Bin 0 -> 1114 bytes
 .../folder-test/parquet/patients3.parquet | Bin 0 -> 1112 bytes
 .../folder-test/txt-csv/patient1.txt      |   4 +
 .../folder-test/txt-csv/patient2.txt      |   4 +
 .../folder-test/txt-csv/patient3.txt      |   4 +
 .../folder-test/txt-ndjson/patient1.txt   |   3 +
 .../folder-test/txt-ndjson/patient2.txt   |   3 +
 .../folder-test/txt-ndjson/patient3.txt   |   3 +
 .../single-file-test/patients-ndjson.txt  |  11 +
 .../single-file-test/patients.csv         |  12 +
 .../single-file-test/patients.json        |  11 +
 .../single-file-test/patients.parquet     | Bin 0 -> 2065 bytes
 .../single-file-test/patients.tsv         |  11 +
 .../single-file-test/patients.txt         |  12 +
 .../data/read/FileDataSourceReaderTest.scala | 471 ++++++++++++++++++
 22 files changed, 573 insertions(+)
 create mode 100644 tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/csv/patient1.csv
 create mode 100644 tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/csv/patient2.csv
 create mode 100644 tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/csv/patient3.csv
 create mode 100644 tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/json/patient1.json
 create mode 100644 tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/json/patient2.json
 create mode 100644 tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/json/patient3.json
 create mode 100644 tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/parquet/patients1.parquet
 create mode 100644 tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/parquet/patients2.parquet
 create mode 100644 tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/parquet/patients3.parquet
 create mode 100644 tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/txt-csv/patient1.txt
 create mode 100644 tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/txt-csv/patient2.txt
 create mode 100644 tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/txt-csv/patient3.txt
 create mode 100644 tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/txt-ndjson/patient1.txt
 create mode 100644 tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/txt-ndjson/patient2.txt
 create mode 100644 tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/txt-ndjson/patient3.txt
 create mode 100644 tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients-ndjson.txt
 create mode 100644 tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients.csv
 create mode 100644 tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients.json
 create mode 100644 tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients.parquet
 create mode 100644 tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients.tsv
 create mode 100644 tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients.txt
 create mode 100644 tofhir-engine/src/test/scala/io/tofhir/test/engine/data/read/FileDataSourceReaderTest.scala

diff --git a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/csv/patient1.csv b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/csv/patient1.csv
new file mode 100644
index 00000000..2bf60499
--- /dev/null
+++ b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/csv/patient1.csv
@@ -0,0 +1,5 @@
+! This line is a comment to test spark options
+pid,gender,birthDate
+p1,male,2000-05-10
+p2,male,1985-05-08
+p3,male,1997-02
\ No newline at end of file
diff --git a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/csv/patient2.csv b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/csv/patient2.csv
new file mode 100644
index 00000000..cf0b2bda
--- /dev/null
+++ b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/csv/patient2.csv
@@ -0,0 +1,5 @@
+! This line is a comment to test spark options
+pid,gender,birthDate
+p4,male,1999-06-05
+p5,male,1965-10-01
+p6,female,1991-03
\ No newline at end of file
diff --git a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/csv/patient3.csv b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/csv/patient3.csv
new file mode 100644
index 00000000..bde9dd4c
--- /dev/null
+++ b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/csv/patient3.csv
@@ -0,0 +1,5 @@
+! This line is a comment to test spark options
+pid,gender,birthDate
+p7,female,1972-10-25
+p8,female,2010-01-10
+p9,female,1999-05-12
\ No newline at end of file
diff --git a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/json/patient1.json b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/json/patient1.json
new file mode 100644
index 00000000..28e95146
--- /dev/null
+++ b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/json/patient1.json
@@ -0,0 +1,3 @@
+{"pid": "p1", "gender": "male", "birthDate": "2000-05-10"},
+{"pid": "p2", "gender": "male", "birthDate": "1985-05-08"},
+{"pid": "p3", "gender": "male", "birthDate": "1997-02"}
\ No newline at end of file
diff --git a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/json/patient2.json b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/json/patient2.json
new file mode 100644
index 00000000..e9aeb610
--- /dev/null
+++ b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/json/patient2.json
@@ -0,0 +1,3 @@
+{"pid": "p4", "gender": "male", "birthDate": "1999-06-05"},
+{"pid": "p5", "gender": "male", "birthDate": "1965-10-01"},
+{"pid": "p6", "gender": "female", "birthDate": "1991-03"}
\ No newline at end of file
diff --git a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/json/patient3.json b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/json/patient3.json
new file mode 100644
index 00000000..6b5b9e9a
--- /dev/null
+++ b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/json/patient3.json
@@ -0,0 +1,3 @@
+{"pid": "p7", "gender": "female", "birthDate": "1972-10-25"},
+{"pid": "p8", "gender": "female", "birthDate": "2010-01-10"},
+{"pid": "p9", "gender": "female", "birthDate": "1999-05-12"}
\ No newline at end of file
diff --git a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/parquet/patients1.parquet b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/folder-test/parquet/patients1.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..502c56472084891fdb402030faeeb16bbffb735b
GIT binary patch
literal 1091
[base85 binary data omitted; the binary literals for patients2.parquet, patients3.parquet and single-file-test/patients.parquet, along with the diffs for the txt-csv, txt-ndjson, patients-ndjson.txt, patients.csv and patients.json fixtures, are garbled beyond recovery here]
literal 0
HcmV?d00001

diff --git a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients.tsv b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients.tsv
new file mode 100644
index 00000000..987ba87d
--- /dev/null
+++ b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients.tsv
@@ -0,0 +1,11 @@
+pid	gender	birthDate	deceasedDateTime	homePostalCode
+p1	male	2000-05-10
+p2	male	1985-05-08	2017-03-10	G02547
+p3	male	1997-02
+p4	male	1999-06-05		H10564
+p5	male	1965-10-01	2019-04-21	G02547
+p6	female	1991-03
+p7	female	1972-10-25		V13135
+p8	female	2010-01-10		Z54564
+p9	female	1999-05-12
+p10	female	2003-11
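The fixtures above and below are built around the Spark CSV options the new tests exercise: a leading "!" comment line and deliberate trailing spaces. As a reference point, here is a minimal plain-Spark sketch of those options (stock Spark behavior only, with illustrative paths; this is not FileDataSourceReader's actual wiring):

import org.apache.spark.sql.SparkSession

object CsvOptionsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("csv-options-sketch").getOrCreate()
    // "comment" -> "!" makes Spark skip the leading "! ..." line,
    // and "ignoreTrailingWhiteSpace" -> "true" strips padding such as "p1 ".
    val txt = spark.read
      .option("header", "true")
      .option("comment", "!")
      .option("ignoreTrailingWhiteSpace", "true")
      .csv("single-file-test/patients.txt") // illustrative path
    // TSV is the same CSV reader with a tab separator.
    val tsv = spark.read
      .option("header", "true")
      .option("sep", "\t")
      .csv("single-file-test/patients.tsv") // illustrative path
    txt.show()
    tsv.show()
    spark.stop()
  }
}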
diff --git a/tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients.txt b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients.txt
new file mode 100644
index 00000000..dc6b243e
--- /dev/null
+++ b/tofhir-engine/src/test/resources/file-data-source-reader-test-data/single-file-test/patients.txt
@@ -0,0 +1,12 @@
+! This file has trailing spaces for testing purposes
+pid,gender,birthDate,deceasedDateTime ,homePostalCode
+p1 ,male,2000-05-10,,
+p2,male,1985-05-08,2017-03-10,G02547
+p3,male,1997-02,,
+p4,male,1999-06-05,,H10564
+p5,male,1965-10-01,2019-04-21,G02547
+p6,female ,1991-03,,
+p7 ,female,1972-10-25,,V13135
+p8,female,2010-01-10,,Z54564
+p9,female,1999-05-12,,
+p10,female,2003-11 ,,
diff --git a/tofhir-engine/src/test/scala/io/tofhir/test/engine/data/read/FileDataSourceReaderTest.scala b/tofhir-engine/src/test/scala/io/tofhir/test/engine/data/read/FileDataSourceReaderTest.scala
new file mode 100644
index 00000000..a6e2be9f
--- /dev/null
+++ b/tofhir-engine/src/test/scala/io/tofhir/test/engine/data/read/FileDataSourceReaderTest.scala
@@ -0,0 +1,471 @@
+package io.tofhir.test.engine.data.read
+
+import io.tofhir.engine.config.ToFhirConfig
+import io.tofhir.engine.data.read.FileDataSourceReader
+import io.tofhir.engine.model.{FileSystemSource, FileSystemSourceSettings, SourceFileFormats}
+import org.apache.spark.sql.{DataFrame, Row, SparkSession}
+import org.scalatest.BeforeAndAfterAll
+import org.scalatest.flatspec.AnyFlatSpec
+import org.scalatest.matchers.must.Matchers.{contain, include}
+import org.scalatest.matchers.should.Matchers.convertToAnyShouldWrapper
+
+import java.nio.file.Paths
+import java.sql.Timestamp
+import java.text.SimpleDateFormat
+import scala.language.postfixOps
+
+/**
+ * Unit tests for the FileDataSourceReader class.
+ * Tests the functionality of reading files in different formats.
+ */
+class FileDataSourceReaderTest extends AnyFlatSpec with BeforeAndAfterAll {
+
+  /**
+   * SparkSession used for the test cases.
+   */
+  val sparkSession: SparkSession = ToFhirConfig.sparkSession
+  /**
+   * Path to the directory containing test data for FileDataSourceReader.
+   */
+  val testDataFolderPath: String = Paths.get(getClass.getResource("/file-data-source-reader-test-data").toURI).toAbsolutePath.toString
+  /**
+   * Instance of FileDataSourceReader used to read data files during tests.
+   */
+  val fileDataSourceReader = new FileDataSourceReader(sparkSession)
+  /**
+   * Date format used for parsing and formatting date values in test cases.
+   */
+  val dateFormat: SimpleDateFormat = new SimpleDateFormat("yyyy-MM-dd")
+
+  override def beforeAll(): Unit = {
+    super.beforeAll()
+  }
+
+  override def afterAll(): Unit = {
+    super.afterAll()
+  }
+
+  /**
+   * Tests that the FileDataSourceReader correctly handles invalid input by throwing the appropriate exceptions.
+   *
+   * The test case covers two scenarios:
+   * 1. Providing a file path instead of a directory for a streaming job should result in an IllegalArgumentException.
+   * 2. Providing unsupported file formats or extensions should result in a NotImplementedError.
+   *
+   * The following configurations are used for the tests:
+   * - `illegalArgumentSourceBinding`: A source binding with a 'file' path to test the directory requirement for streaming jobs.
+   * - `unsupportedFileFormatSourceBinding`: A source binding with an unsupported file format to test the unsupported format handling.
+   * - `unsupportedExtensionSourceBinding`: A source binding with an unsupported file extension to test the unsupported extension handling.
+   * - `streamJobSourceSettings`: Mapping job source settings configured for a streaming job.
+   * - `batchJobSourceSettings`: Mapping job source settings configured for a batch job.
+   *
+   * The test verifies the following:
+   * 1. An IllegalArgumentException is thrown with the expected message when a file path is provided instead of a directory for a streaming job.
+   * 2. A NotImplementedError is thrown for unsupported file formats and extensions, indicating that these cases are not yet implemented or handled.
+   *
+   */
+  it should "throw IllegalArgumentException, NotImplementedError when necessary" in {
+    // Folder containing the test files for this test
+    val folderPath = "/single-file-test"
+
+    // Test case 1: Verify that providing a file path instead of a directory throws an IllegalArgumentException
+    val fileName: String = "patients.csv"
+    val illegalArgumentSourceBinding = FileSystemSource(path = fileName)
+    val streamJobSourceSettings = FileSystemSourceSettings(name = "FileDataSourceReaderTest0", sourceUri = "test-uri", dataFolderPath = testDataFolderPath.concat(folderPath), asStream = true)
+    val exception = intercept[IllegalArgumentException] {
+      fileDataSourceReader.read(illegalArgumentSourceBinding, streamJobSourceSettings, Option.empty)
+    }
+    exception.getMessage should include(s"${fileName} is not a directory. For streaming job, you should provide a directory.")
+
+    // Test case 2: Verify that unsupported file formats and extensions throw a NotImplementedError
+    val unsupportedFileFormatSourceBinding = FileSystemSource(path = fileName, fileFormat = Some("UNSUPPORTED"))
+    val unsupportedExtensionSourceBinding = FileSystemSource(path = "patients.UNSUPPORTED")
+    val batchJobSourceSettings = streamJobSourceSettings.copy(asStream = false)
+    assertThrows[NotImplementedError] {
+      fileDataSourceReader.read(unsupportedFileFormatSourceBinding, batchJobSourceSettings, Option.empty)
+    }
+    assertThrows[NotImplementedError] {
+      fileDataSourceReader.read(unsupportedExtensionSourceBinding, batchJobSourceSettings, Option.empty)
+    }
+  }
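A side note on the two assertion styles used in the test above: `intercept` hands back the thrown exception so its message can be inspected, whereas `assertThrows` only verifies the exception type. A self-contained sketch:

import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers._

class AssertionStylesSpec extends AnyFlatSpec {
  "The two styles" should "differ in what they return" in {
    // intercept returns the exception, allowing follow-up checks on it
    val e = intercept[IllegalArgumentException] {
      require(false, "boom")
    }
    e.getMessage should include("boom")
    // assertThrows only asserts the exception type
    assertThrows[ArithmeticException] {
      1 / 0
    }
  }
}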
+
+  /**
+   * Tests that the FileDataSourceReader correctly reads data from CSV, TSV, and TXT_CSV files.
+   *
+   * This test verifies that the reader can handle different file formats and produce the expected results.
+   * The test covers the following formats:
+   * 1. CSV
+   * 2. TSV
+   * 3. TXT_CSV (Text file with CSV-like format)
+   *
+   * The test uses the following source binding configurations:
+   * FileSystemSource(path = "patients.csv", fileFormat = None, options = Map("ignoreTrailingWhiteSpace" -> "true", "comment" -> "!"))
+   * FileSystemSource(path = "patients.tsv", fileFormat = None, options = Map("ignoreTrailingWhiteSpace" -> "true", "comment" -> "!"))
+   * FileSystemSource(path = "patients.txt", fileFormat = Some(SourceFileFormats.TXT_CSV), options = Map("ignoreTrailingWhiteSpace" -> "true", "comment" -> "!"))
+   *
+   * The expected read result for all file formats:
+   * +---+------+-------------------+----------------+--------------+
+   * |pid|gender|          birthDate|deceasedDateTime|homePostalCode|
+   * +---+------+-------------------+----------------+--------------+
+   * | p1|  male|2000-05-10 00:00:00|            NULL|          NULL|
+   * | p2|  male|1985-05-08 00:00:00|      2017-03-10|        G02547|
+   * | p3|  male|1997-02-01 00:00:00|            NULL|          NULL|
+   * | p4|  male|1999-06-05 00:00:00|            NULL|        H10564|
+   * | p5|  male|1965-10-01 00:00:00|      2019-04-21|        G02547|
+   * | p6|female|1991-03-01 00:00:00|            NULL|          NULL|
+   * | p7|female|1972-10-25 00:00:00|            NULL|        V13135|
+   * | p8|female|2010-01-10 00:00:00|            NULL|        Z54564|
+   * | p9|female|1999-05-12 00:00:00|            NULL|          NULL|
+   * |p10|female|2003-11-01 00:00:00|            NULL|          NULL|
+   * +---+------+-------------------+----------------+--------------+
+   *
+   */
+  it should "correctly read from CSV, TSV, and TXT_CSV files" in {
+    // Folder containing the test files for this test
+    val folderPath = "/single-file-test"
+
+    // Expected values for validation
+    val expectedRowNumber = 10
+    val expectedColumns = Array("pid", "gender", "birthDate", "deceasedDateTime", "homePostalCode")
+    val expectedFirstRow = Row("p1", "male", new Timestamp(dateFormat.parse("2000-05-10").getTime), null, null)
+    val expectedLastRow = Row("p10", "female", new Timestamp(dateFormat.parse("2003-11-01").getTime), null, null)
+
+    // A sequence of file names and their corresponding formats to be tested
+    val sourceBindingConfigurations = Seq(
+      ("patients.csv", None),
+      ("patients.tsv", None),
+      ("patients.txt", Some(SourceFileFormats.TXT_CSV))
+    )
+
+    // Spark options to test if options are working
+    val sparkOptions = Map(
+      "ignoreTrailingWhiteSpace" -> "true",
+      "comment" -> "!"
+    )
+
+    // Loop through each source binding configuration to run the test
+    val mappingJobSourceSettings = FileSystemSourceSettings(name = "FileDataSourceReaderTest1", sourceUri = "test-uri", dataFolderPath = testDataFolderPath.concat(folderPath))
+    sourceBindingConfigurations.foreach { case (fileName, fileFormat) =>
+      // Read the data using the reader and the defined settings
+      val mappingSourceBinding = FileSystemSource(path = fileName, fileFormat = fileFormat, options = sparkOptions)
+      val result: DataFrame = fileDataSourceReader.read(mappingSourceBinding, mappingJobSourceSettings, Option.empty)
+
+      // Validate the result
+      result.count() shouldBe expectedRowNumber
+      result.columns shouldBe expectedColumns
+      result.first() shouldBe expectedFirstRow
+      result.collect().last shouldBe expectedLastRow
+    }
+  }
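The Timestamp values in the expected rows above imply that CSV-like input goes through Spark's schema inference (an assumption about the reader's internals, not something this patch states). In stock Spark, inference is opt-in and promotes date-like columns roughly as in this sketch:

import org.apache.spark.sql.SparkSession

// Sketch of stock Spark schema inference, independent of FileDataSourceReader.
object InferSchemaSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("infer-schema-sketch").getOrCreate()
    val df = spark.read
      .option("header", "true")
      // With inference on, columns of values like "2000-05-10" come back as
      // TimestampType instead of StringType; how partial dates such as
      // "1997-02" parse depends on the session's time-parser policy.
      .option("inferSchema", "true")
      .csv("single-file-test/patients.csv") // illustrative path
    df.printSchema()
    spark.stop()
  }
}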
+
+  /**
+   * Tests that the FileDataSourceReader correctly reads multiple files from CSV and TXT_CSV folders.
+   *
+   * This test verifies that the reader can handle multiple files across different file formats
+   * and produce the expected results. The test covers reading from folders containing:
+   * 1. CSV files
+   * 2. TXT_CSV (Text files with CSV-like format)
+   *
+   * The test uses the following source binding configurations:
+   * FileSystemSource(path = "csv", fileFormat = Some(SourceFileFormats.CSV), options = Map("ignoreTrailingWhiteSpace" -> "true", "comment" -> "!"))
+   * FileSystemSource(path = "txt-csv", fileFormat = Some(SourceFileFormats.TXT_CSV), options = Map("ignoreTrailingWhiteSpace" -> "true", "comment" -> "!"))
+   *
+   * The expected read result for both folder formats:
+   * +---+------+-------------------+
+   * |pid|gender|          birthDate|
+   * +---+------+-------------------+
+   * | p1|  male|2000-05-10 00:00:00|
+   * | p2|  male|1985-05-08 00:00:00|
+   * | p3|  male|1997-02-01 00:00:00|
+   * | p4|  male|1999-06-05 00:00:00|
+   * | p5|  male|1965-10-01 00:00:00|
+   * | p6|female|1991-03-01 00:00:00|
+   * | p7|female|1972-10-25 00:00:00|
+   * | p8|female|2010-01-10 00:00:00|
+   * | p9|female|1999-05-12 00:00:00|
+   * +---+------+-------------------+
+   * (Rows may appear in different groupings, with each file contributing a distinct set of 3 rows.)
+   *
+   */
+  it should "correctly read multiple files from CSV, TXT_CSV folders" in {
+    // Folder containing the test folders for this test
+    val folderPath = "/folder-test"
+
+    // Expected values for validation
+    val expectedRowNumber = 9
+    val expectedColumns = Array("pid", "gender", "birthDate")
+    val expectedRows = Set( // One row from each file
+      Row("p1", "male", new Timestamp(dateFormat.parse("2000-05-10").getTime)),
+      Row("p4", "male", new Timestamp(dateFormat.parse("1999-06-05").getTime)),
+      Row("p7", "female", new Timestamp(dateFormat.parse("1972-10-25").getTime))
+    )
+
+    // A sequence of folder names and the file formats of the files to be selected
+    val sourceBindingConfigurations = Seq(
+      ("csv", Some(SourceFileFormats.CSV)),
+      ("txt-csv", Some(SourceFileFormats.TXT_CSV))
+    )
+    // Spark options to test if options are working
+    val sparkOptions = Map(
+      "ignoreTrailingWhiteSpace" -> "true",
+      "comment" -> "!"
+    )
+
+    // Loop through each source binding configuration to run the test
+    val mappingJobSourceSettings = FileSystemSourceSettings(name = "FileDataSourceReaderTest2", sourceUri = "test-uri", dataFolderPath = testDataFolderPath.concat(folderPath))
+    sourceBindingConfigurations.foreach { case (folderName, fileFormat) =>
+      // Read the data using the reader and the defined settings
+      val mappingSourceBinding = FileSystemSource(path = folderName, fileFormat = fileFormat, options = sparkOptions)
+      val result: DataFrame = fileDataSourceReader.read(mappingSourceBinding, mappingJobSourceSettings, Option.empty)
+
+      // Validate the result
+      result.count() shouldBe expectedRowNumber
+      result.columns shouldBe expectedColumns
+      result.collect().toSet should contain allElementsOf expectedRows
+    }
+  }
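Reading a folder instead of a single file is ordinary Spark behavior: a directory path loads every file in it as one DataFrame, and the generic pathGlobFilter option can narrow which files are picked up (one plausible way, though not necessarily the reader's own, to select only .txt files for the txt-csv format). A sketch under those assumptions:

import org.apache.spark.sql.SparkSession

object FolderReadSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("folder-read-sketch").getOrCreate()
    // A directory path makes Spark read every file in it into a single DataFrame.
    val allCsv = spark.read
      .option("header", "true")
      .option("comment", "!")
      .csv("folder-test/csv") // illustrative path
    // pathGlobFilter restricts which files are read, e.g. only the .txt fixtures.
    val txtCsv = spark.read
      .option("header", "true")
      .option("comment", "!")
      .option("pathGlobFilter", "*.txt")
      .csv("folder-test/txt-csv") // illustrative path
    println(s"rows: ${allCsv.count()} + ${txtCsv.count()}")
    spark.stop()
  }
}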
+
+  /**
+   * Tests that the FileDataSourceReader correctly reads data from JSON and TXT_NDJSON files.
+   *
+   * This test verifies that the reader can handle different file formats and produce the expected results.
+   * The test covers the following formats:
+   * 1. JSON
+   * 2. TXT_NDJSON (Text file with newline-delimited JSON format)
+   *
+   * The test uses the following source binding configurations:
+   * FileSystemSource(path = "patients.json", fileFormat = None, options = Map("allowComments" -> "true"))
+   * FileSystemSource(path = "patients-ndjson.txt", fileFormat = Some(SourceFileFormats.TXT_NDJSON), options = Map("allowComments" -> "true"))
+   *
+   * The expected read result for both file formats:
+   * +----------+----------------+------+--------------+---+
+   * | birthDate|deceasedDateTime|gender|homePostalCode|pid|
+   * +----------+----------------+------+--------------+---+
+   * |2000-05-10|            NULL|  male|          NULL| p1|
+   * |1985-05-08|      2017-03-10|  male|        G02547| p2|
+   * |   1997-02|            NULL|  male|          NULL| p3|
+   * |1999-06-05|            NULL|  male|        H10564| p4|
+   * |1965-10-01|      2019-04-21|  male|        G02547| p5|
+   * |   1991-03|            NULL|female|          NULL| p6|
+   * |1972-10-25|            NULL|female|        V13135| p7|
+   * |2010-01-10|            NULL|female|        Z54564| p8|
+   * |1999-05-12|            NULL|female|          NULL| p9|
+   * |   2003-11|            NULL|female|          NULL|p10|
+   * +----------+----------------+------+--------------+---+
+   *
+   */
+  it should "correctly read from JSON and TXT-NDJSON files" in {
+    // Folder containing the test files
+    val folderPath = "/single-file-test"
+
+    // Define the expected values for validation (Note: Spark reads JSON columns in alphabetical order)
+    val expectedRowNumber = 10
+    val expectedColumns = Array("birthDate", "deceasedDateTime", "gender", "homePostalCode", "pid")
+    val expectedFirstRow = Row("2000-05-10", null, "male", null, "p1")
+    val expectedLastRow = Row("2003-11", null, "female", null, "p10")
+
+    // Define the file names and their corresponding formats to be tested
+    val sourceBindingConfigurations = Seq(
+      ("patients.json", None),
+      ("patients-ndjson.txt", Some(SourceFileFormats.TXT_NDJSON))
+    )
+    // Spark options to test if options are working
+    val sparkOptions = Map(
+      "allowComments" -> "true"
+    )
+
+    // Loop through each source binding configuration to run the test
+    val mappingJobSourceSettings = FileSystemSourceSettings(name = "FileDataSourceReaderTest3", sourceUri = "test-uri", dataFolderPath = testDataFolderPath.concat(folderPath))
+    sourceBindingConfigurations.foreach { case (fileName, fileFormat) =>
+      // Define the source binding and settings for reading the file
+      val mappingSourceBinding = FileSystemSource(path = fileName, fileFormat = fileFormat, options = sparkOptions)
+      // Read the data from the specified file
+      val result: DataFrame = fileDataSourceReader.read(mappingSourceBinding, mappingJobSourceSettings, Option.empty)
+
+      // Validate the result
+      result.count() shouldBe expectedRowNumber
+      result.columns shouldBe expectedColumns
+      result.first() shouldBe expectedFirstRow
+      result.collect().last shouldBe expectedLastRow
+    }
+  }
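Spark's JSON source reads newline-delimited JSON by default (one object per line) and, as the comment in the test notes, orders the resulting columns alphabetically; the allowComments option additionally tolerates Java/C++-style comments inside records. A minimal sketch, independent of the reader:

import org.apache.spark.sql.SparkSession

object JsonReadSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("json-read-sketch").getOrCreate()
    val df = spark.read
      .option("allowComments", "true") // tolerate // and /* */ comments in records
      .json("single-file-test/patients-ndjson.txt") // illustrative path
    df.show() // columns appear in alphabetical order: birthDate, ..., pid
    spark.stop()
  }
}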
+
+  /**
+   * Tests that the FileDataSourceReader correctly reads multiple files from JSON and NDJSON folders.
+   *
+   * This test verifies that the reader can handle multiple files across different file formats
+   * and produce the expected results. The test covers reading from folders containing:
+   * 1. JSON (standard JSON files in the "json" folder)
+   * 2. TXT_NDJSON (newline-delimited JSON files in the "txt-ndjson" folder)
+   *
+   * The test uses the following source binding configurations:
+   * FileSystemSource(path = "json", fileFormat = Some(SourceFileFormats.JSON))
+   * FileSystemSource(path = "txt-ndjson", fileFormat = Some(SourceFileFormats.TXT_NDJSON))
+   *
+   * The expected read result for both formats:
+   * +----------+------+---+
+   * | birthDate|gender|pid|
+   * +----------+------+---+
+   * |2000-05-10|  male| p1|
+   * |1985-05-08|  male| p2|
+   * |   1997-02|  male| p3|
+   * |1999-06-05|  male| p4|
+   * |1965-10-01|  male| p5|
+   * |   1991-03|female| p6|
+   * |1972-10-25|female| p7|
+   * |2010-01-10|female| p8|
+   * |1999-05-12|female| p9|
+   * +----------+------+---+
+   * (Rows may appear in different groupings, with each file contributing a distinct set of 3 rows.)
+   */
+  it should "correctly read multiple files from JSON and NDJSON folders" in {
+    // Folder containing the test folders for JSON and NDJSON files
+    val folderPath = "/folder-test"
+
+    // Expected values for validation
+    val expectedRowNumber = 9
+    val expectedColumns = Array("birthDate", "gender", "pid")
+    // Expected rows for validation, one row from each file
+    val expectedRows = Set(
+      Row("2000-05-10", "male", "p1"),
+      Row("1999-06-05", "male", "p4"),
+      Row("1972-10-25", "female", "p7")
+    )
+
+    // A sequence of folder names and the file formats of the files to be selected
+    val sourceBindingConfigurations = Seq(
+      ("json", Some(SourceFileFormats.JSON)),
+      ("txt-ndjson", Some(SourceFileFormats.TXT_NDJSON))
+    )
+
+    // Loop through each source binding configuration to run the test
+    val mappingJobSourceSettings = FileSystemSourceSettings(name = "FileDataSourceReaderTest4", sourceUri = "test-uri", dataFolderPath = testDataFolderPath.concat(folderPath))
+    sourceBindingConfigurations.foreach { case (folderName, fileFormat) =>
+      // Read the data using the reader and the defined settings
+      val mappingSourceBinding = FileSystemSource(path = folderName, fileFormat = fileFormat)
+      val result: DataFrame = fileDataSourceReader.read(mappingSourceBinding, mappingJobSourceSettings, Option.empty)
+
+      // Validate the result
+      result.count() shouldBe expectedRowNumber
+      result.columns shouldBe expectedColumns
+      result.collect().toSet should contain allElementsOf expectedRows
+    }
+  }
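Because each folder's files may be concatenated in any order, the folder tests assert the row count plus membership rather than positions. ScalaTest's `contain allElementsOf` matcher is what makes that order-insensitive check concise:

import org.scalatest.matchers.should.Matchers._

object AllElementsOfSketch {
  def main(args: Array[String]): Unit = {
    // Passes as long as every expected element is present,
    // regardless of ordering or of extra elements.
    Seq("p2", "p1", "p4", "p7") should contain allElementsOf Set("p1", "p4", "p7")
  }
}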
+
+  /**
+   * Tests that the FileDataSourceReader correctly reads data from a Parquet file.
+   *
+   * This test verifies that the reader can handle the Parquet file format and produce the expected results.
+   * The test covers the following format:
+   * 1. PARQUET
+   *
+   * The test uses the following source binding configuration:
+   * FileSystemSource(path = "patients.parquet", fileFormat = Some(SourceFileFormats.PARQUET))
+   *
+   * The expected read result for the Parquet file is:
+   * +---+------+----------+----------------+--------------+
+   * |pid|gender| birthDate|deceasedDateTime|homePostalCode|
+   * +---+------+----------+----------------+--------------+
+   * | p1|  male|2000-05-10|            null|          null|
+   * | p2|  male|1985-05-08|      2017-03-10|        G02547|
+   * | p3|  male|1997-02-01|            null|          null|
+   * | p4|  male|1999-06-05|            null|        H10564|
+   * | p5|  male|1965-10-01|      2019-04-21|        G02547|
+   * | p6|female|1991-03-01|            null|          null|
+   * | p7|female|1972-10-25|            null|        V13135|
+   * | p8|female|2010-01-10|            null|        Z54564|
+   * | p9|female|1999-05-12|            null|          null|
+   * |p10|female|   2003-11|            null|          null|
+   * +---+------+----------+----------------+--------------+
+   *
+   */
+  it should "correctly read from Parquet file" in {
+    // Folder containing the test files
+    val folderPath = "/single-file-test"
+
+    // Define the expected values for validation
+    val expectedRowNumber = 10
+    val expectedColumns = Array("pid", "gender", "birthDate", "deceasedDateTime", "homePostalCode")
+    val expectedFirstRow = Row("p1", "male", "2000-05-10", null, null)
+    val expectedLastRow = Row("p10", "female", "2003-11", null, null)
+
+    // Define the file name and its corresponding format for Parquet
+    val sourceBindingConfigurations = Seq(
+      ("patients.parquet", Some(SourceFileFormats.PARQUET))
+    )
+
+    // Loop through each source binding configuration to run the test
+    val mappingJobSourceSettings = FileSystemSourceSettings(name = "FileDataSourceReaderTest5", sourceUri = "test-uri", dataFolderPath = testDataFolderPath.concat(folderPath))
+    sourceBindingConfigurations.foreach { case (fileName, fileFormat) =>
+      // Define the source binding and settings for reading the file
+      val mappingSourceBinding = FileSystemSource(path = fileName, fileFormat = fileFormat)
+      // Read the data from the specified file
+      val result: DataFrame = fileDataSourceReader.read(mappingSourceBinding, mappingJobSourceSettings, Option.empty)
+
+      // Validate the result
+      result.count() shouldBe expectedRowNumber
+      result.columns shouldBe expectedColumns
+      result.first() shouldBe expectedFirstRow
+      result.collect().last shouldBe expectedLastRow
+    }
+  }
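Unlike CSV and JSON, Parquet is self-describing: the schema travels with the file, so columns keep their written order and types without header or inferSchema options, and birthDate stays a plain string here because the fixtures stored it that way. A stock-Spark sketch:

import org.apache.spark.sql.SparkSession

object ParquetReadSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("parquet-read-sketch").getOrCreate()
    // No header/inferSchema options: the schema is embedded in the file itself.
    val df = spark.read.parquet("single-file-test/patients.parquet") // illustrative path
    df.printSchema() // pid, gender, birthDate, ... in their written order
    spark.stop()
  }
}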
+ */ + it should "correctly read multiple files from Parquet folders" in { + // Folder including the test folders belonging to this test + val folderPath = "/folder-test" + + // Expected values for validation + val expectedRowNumber = 9 + val expectedColumns = Array("pid", "gender", "birthDate") + // Expected rows for validation, one row from each file + val expectedRows = Set( + Row("p1", "male", "2000-05-10"), + Row("p4", "male", "1999-06-05"), + Row("p7", "female", "1972-10-25") + ) + + // A sequence of folder names and file format of the files to be selected + val sourceBindingConfigurations = Seq( + ("parquet", Some(SourceFileFormats.PARQUET)) + ) + + // Loop through each source binding configuration to run the test + val mappingJobSourceSettings = FileSystemSourceSettings(name = "FileDataSourceReaderTest6", sourceUri = "test-uri", dataFolderPath = testDataFolderPath.concat(folderPath)) + sourceBindingConfigurations.foreach { case (folderName, fileFormat) => + // Read the data using the reader and the defined settings + val mappingSourceBinding = FileSystemSource(path = folderName, fileFormat = fileFormat) + val result: DataFrame = fileDataSourceReader.read(mappingSourceBinding, mappingJobSourceSettings, Option.empty) + + // Validate the result + result.count() shouldBe expectedRowNumber + result.columns shouldBe expectedColumns + result.collect().toSet should contain allElementsOf expectedRows + } + } +} \ No newline at end of file