[Spark] Optimize fileInfo Scan (lakesoul-io#549)

* optimize fileInfo Scan

  Signed-off-by: zenghua <[email protected]>

* cleanup code

  Signed-off-by: zenghua <[email protected]>

---------

Signed-off-by: zenghua <[email protected]>
Co-authored-by: zenghua <[email protected]>
F-PHantam · Oct 11, 2024 · bb19636 · bb19636
1 parent 068f935
commit bb19636
Show file tree

Hide file tree

Showing 2 changed files with 13 additions and 6 deletions.
diff --git a/...src/main/scala/org/apache/spark/sql/execution/datasources/v2/merge/MergeParquetScan.scala b/...src/main/scala/org/apache/spark/sql/execution/datasources/v2/merge/MergeParquetScan.scala
@@ -265,16 +265,25 @@ abstract class MergeDeltaParquetScan(sparkSession: SparkSession,
           })
       }
 
+      val fileInfoSeq = if (isStreaming) newFileIndex.getFileInfoForPartitionVersion() else fileInfo
+      val dataInfoPath = fileInfoSeq.mkString(",")
+      val fs = partition.files.head.getPath
+        .getFileSystem(sparkSession.sessionState.newHadoopConf())
+      val pathToFileInfoMap = fileInfoSeq.map(f => fs.makeQualified(new Path(f.path)).toString -> f).toMap
+
       partition.files.flatMap { file =>
         val filePath = file.getPath
+        val qualifiedPath = fs.makeQualified(filePath).toString
+
+        val touchedFileInfo = pathToFileInfoMap.getOrElse(qualifiedPath, throw LakeSoulErrors.filePathNotFoundException(qualifiedPath, dataInfoPath))
 
         MergePartitionedFileUtil.notSplitFiles(
           sparkSession,
           file,
           filePath,
           partitionValues,
           tableInfo,
-          fileInfo = if (isStreaming) newFileIndex.getFileInfoForPartitionVersion() else fileInfo,
+          touchedFileInfo,
           requestFilesSchemaMap,
           readDataSchema,
           readPartitionSchema.fieldNames)

diff --git a/.../scala/org/apache/spark/sql/execution/datasources/v2/merge/MergePartitionedFileUtil.scala b/.../scala/org/apache/spark/sql/execution/datasources/v2/merge/MergePartitionedFileUtil.scala
@@ -18,7 +18,7 @@ object MergePartitionedFileUtil {
                     filePath: Path,
                     partitionValues: InternalRow,
                     tableInfo: TableInfo,
-                    fileInfo: Seq[DataFileInfo],
+                    touchedFileInfo: DataFileInfo,
                     requestFilesSchemaMap: Map[String, StructType],
                     requestDataSchema: StructType,
                     requestPartitionFields: Array[String]): Seq[MergePartitionedFile] = {
@@ -28,7 +28,7 @@ object MergePartitionedFileUtil {
       filePath,
       partitionValues,
       tableInfo,
-      fileInfo,
+      touchedFileInfo,
       requestFilesSchemaMap,
       requestDataSchema,
       requestPartitionFields))
@@ -39,7 +39,7 @@ object MergePartitionedFileUtil {
                          filePath: Path,
                          partitionValues: InternalRow,
                          tableInfo: TableInfo,
-                         fileInfo: Seq[DataFileInfo],
+                         touchedFileInfo: DataFileInfo,
                          requestFilesSchemaMap: Map[String, StructType],
                          requestDataSchema: StructType,
                          requestPartitionFields: Array[String]): MergePartitionedFile = {
@@ -49,8 +49,6 @@ object MergePartitionedFileUtil {
       .getFileSystem(sparkSession.sessionState.newHadoopConf())
     val filePathStr = fs
       .makeQualified(filePath).toString
-    val touchedFileInfo = fileInfo.find(f => filePathStr.equals(fs.makeQualified(new Path(f.path)).toString))
-      .getOrElse(throw LakeSoulErrors.filePathNotFoundException(filePathStr, fileInfo.mkString(",")))
 
     val touchedFileSchema = requestFilesSchemaMap(touchedFileInfo.range_version).fieldNames