apache · FelixYBW · Jan 23, 2025 · Jan 9, 2025 · Jan 9, 2025 · Jan 9, 2025
diff --git a/pom.xml b/pom.xml
@@ -402,6 +402,16 @@
       </properties>
     </profile>
 
+    <!-- Opt-in profile (off by default): when enabled, adds the
+         tools/qualification-tool module to the build. -->
+    <profile>
+      <id>qualification-tool</id>
+      <activation>
+        <activeByDefault>false</activeByDefault>
+      </activation>
+      <modules>
+        <module>tools/qualification-tool</module>
+      </modules>
+    </profile>
+
     <profile>
       <id>backends-clickhouse</id>
       <activation>

diff --git a/tools/qualification-tool/.gitignore b/tools/qualification-tool/.gitignore
@@ -0,0 +1 @@
+target
diff --git a/tools/qualification-tool/README.MD b/tools/qualification-tool/README.MD
@@ -0,0 +1,162 @@
+# Qualification Tool
+
+The Qualification Tool analyzes Spark event log files to determine the compatibility and performance of SQL workloads with Gluten.
+
+## Build
+
+To compile and package the Qualification Tool, run the following Maven command from the `tools/qualification-tool` directory:
+
+```bash
+mvn clean package
+```
+
+This will create a jar file in the `target` directory.
+
+## Run
+
+To execute the tool, use the following command:
+
+```bash
+java -jar target/qualification-tool-1.3.0-SNAPSHOT-jar-with-dependencies.jar -f <eventFile>
+```
+
+### Parameters:
+- **`-f <eventFile>`**: Path to the Spark event log file(s). This can be:
+  - A single event log file.
+  - A folder containing multiple event log files.
+  - Deeply nested folders of event log files.
+  - Compressed event log files.
+  - Rolling event log files.
+  - A comma-separated list of files.
+- **`-k <gcsKey>`**: (Optional) Path to Google Cloud Storage service account keys.
+- **`-o <output>`**: (Optional) Path to the directory where output will be written. Defaults to a temporary directory.
+- **`-t <threads>`**: (Optional) Number of processing threads. Defaults to 4.
+- **`-v`**: (Optional) Enable non-verbose output. Omit this flag for verbose mode.
+- **`-p <project>`**: (Optional) Project ID for the run.
+- **`-d <dateFilter>`**: (Optional) Analyze only files created after this date (format: YYYY-MM-DD). Defaults to the last 90 days.
+
+### Example Usage:
+```bash
+java -jar target/qualification-tool-1.3.0-SNAPSHOT-jar-with-dependencies.jar -f /path/to/eventlog
+```
+
+### Advanced Example:
+```bash
+java -jar target/qualification-tool-1.3.0-SNAPSHOT-jar-with-dependencies.jar -f /path/to/folder -o /output/path -t 8 -d 2023-01-01 -k /path/to/gcs_keys.json -p my_project
+```
+
+## Features
+
+- Analyzes Spark SQL execution plans for compatibility with Gluten.
+- Supports single files, folders, deeply nested folders, compressed files, and rolling event logs.
+- Provides detailed reports on supported and unsupported operations.
+- Generates metrics on SQL execution times and operator impact.
+- Configurable verbosity and threading.
+
+## How It Works
+
+The Qualification Tool analyzes a Spark plan to determine whether its nodes (operators) and clusters are compatible with Gluten. Here's a step-by-step explanation of the process:
+
+### Example Spark Plan
+
+Consider the following Spark plan:
+
+```
+            G
+        /        \
+      G[2]       G[3]
+       |            |
+      S[2]       G[3]
+       |            |
+      G          S
+    /    \
+  G[1]    G
+  |
+  G[1]
+  |
+  G
+```
+
+- **G**: Represents a plan node supported by Gluten.
+- **S**: Represents a plan node not supported by Gluten.
+- **[1], [2], [3]**: Indicate that the node belongs to the Whole Stage Code Gen (WSCG) block (cluster) with that number.
+
+### 1. NodeSupportVisitor
+
+The first step is marking each node as supported (`*`) or not supported (`!`) by Gluten:
+
+```
+            *G
+        /        \
+      *G[2]       *G[3]
+       |            |
+      !S[2]       *G[3]
+       |            |
+      *G           !S
+    /    \
+  *G[1]    *G
+  |
+  *G[1]
+  |
+  *G
+```
+
+- All supported nodes are marked with `*`.
+- All unsupported nodes are marked with `!`.
+
+### 2. ClusterSupportVisitor
+
+The second step marks every node in a cluster as not supported (`!`) if any node in that cluster is unsupported:
+
+```
+            *G
+        /        \
+      !G[2]       *G[3]
+       |            |
+      !S[2]       *G[3]
+       |            |
+      *G           !S
+    /    \
+  *G[1]    *G
+  |
+  *G[1]
+  |
+  *G
+```
+
+#### Reasoning:
+Although Gluten supports the other operators in such a cluster, breaking Whole Stage Code Gen (WSCG) boundaries introduces row-to-columnar and columnar-to-row conversions, degrading performance. Hence, we pessimistically mark the entire cluster as not supported.
+
+### 3. ChildSupportVisitor
+
+The final step marks a node as not supported if any of its children is not supported, propagating this upward through the node's ancestors:
+
+```
+            !G
+        /        \
+      !G[2]       !G[3]
+       |            |
+      !S[2]       !G[3]
+       |            |
+      *G           !S
+    /    \
+  *G[1]    *G
+  |
+  *G[1]
+  |
+  *G
+```
+
+#### Reasoning:
+If a child node is not supported by Gluten, row-to-columnar and columnar-to-row conversions are added, degrading performance. Therefore, we pessimistically mark such nodes as not supported.
+
+### Summary
+
+The tool conservatively identifies the parts of a Spark plan that can run on Gluten without added conversion overhead by:
+1. Identifying individual node compatibility.
+2. Accounting for cluster boundaries and WSCG optimizations.
+3. Considering child dependencies and their impact on parent nodes.
+
+## Requirements
+
+- **Java**: Ensure you have JDK 11 or later installed.
diff --git a/tools/qualification-tool/pom.xml b/tools/qualification-tool/pom.xml
@@ -0,0 +1,205 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Build descriptor for the Gluten Qualification Tool. No <parent> is declared in this POM. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <groupId>org.apache.gluten</groupId>
+  <artifactId>qualification-tool</artifactId>
+  <version>1.3.0-SNAPSHOT</version>
+  <name>Qualification Tool</name>
+
+  <!-- Build-wide settings: Java/Scala levels, source encoding, and Spotless formatting inputs. -->
+  <properties>
+    <maven.compiler.source>11</maven.compiler.source>
+    <maven.compiler.target>11</maven.compiler.target>
+    <!-- Standard Maven encoding properties, so plugins do not fall back to the platform
+         default encoding. The plain "encoding" property below is kept for compatibility. -->
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
+    <encoding>UTF-8</encoding>
+    <scala.version>2.12.15</scala.version>
+    <scala.binary.version>2.12</scala.binary.version>
+    <!-- NOTE(review): spec2.version is not referenced anywhere in this POM; confirm it is needed. -->
+    <spec2.version>4.2.0</spec2.version>
+    <spotless.version>2.27.2</spotless.version>
+    <spotless.scalafmt.version>3.8.3</spotless.scalafmt.version>
+    <!-- The license header is placed before the first line matching this delimiter. -->
+    <spotless.delimiter>package</spotless.delimiter>
+    <spotless.license.header>
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+    </spotless.license.header>
+  </properties>
+
+  <!-- Dependencies of the tool. All use the default (compile) scope, so the
+       jar-with-dependencies assembly configured below bundles them. -->
+  <dependencies>
+    <!-- The Scala suffix comes from scala.binary.version so it cannot drift from scala.version. -->
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-sql_${scala.binary.version}</artifactId>
+      <version>3.5.0</version>
+    </dependency>
+
+    <!-- Google Cloud Storage connector; presumably backs the GCS key option of the tool
+         (TODO confirm against the sources). -->
+    <dependency>
+      <groupId>com.google.cloud.bigdataoss</groupId>
+      <artifactId>gcs-connector</artifactId>
+      <version>hadoop2-2.2.24</version>
+    </dependency>
+
+    <dependency>
+      <groupId>org.scala-lang</groupId>
+      <artifactId>scala-library</artifactId>
+      <version>${scala.version}</version>
+    </dependency>
+
+    <dependency>
+      <groupId>commons-cli</groupId>
+      <artifactId>commons-cli</artifactId>
+      <version>1.4</version>
+    </dependency>
+
+    <dependency>
+      <groupId>org.jspecify</groupId>
+      <artifactId>jspecify</artifactId>
+      <version>1.0.0</version>
+    </dependency>
+  </dependencies>
+
+  <build>
+    <plugins>
+      <!-- Compiles the Scala sources under src/main/scala. No phase is given on the execution,
+           so the goal runs in the plugin's default phase.
+           NOTE(review): org.scala-tools:maven-scala-plugin is the legacy Scala plugin; confirm
+           whether it should be replaced by the Scala plugin used elsewhere in the project. -->
+      <plugin>
+        <groupId>org.scala-tools</groupId>
+        <artifactId>maven-scala-plugin</artifactId>
+        <version>2.15.2</version>
+        <executions>
+          <execution>
+            <goals>
+              <goal>compile</goal>
+            </goals>
+          </execution>
+        </executions>
+        <configuration>
+          <sourceDir>src/main/scala</sourceDir>
+        </configuration>
+      </plugin>
+      <!-- Java compilation. The language level is taken from the maven.compiler.* properties
+           declared in this POM, so the compile and testCompile executions cannot diverge. -->
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-compiler-plugin</artifactId>
+        <version>3.8.1</version>
+        <executions>
+          <execution>
+            <id>default-compile</id>
+            <phase>compile</phase>
+            <goals>
+              <goal>compile</goal>
+            </goals>
+            <configuration>
+              <source>${maven.compiler.source}</source>
+              <target>${maven.compiler.target}</target>
+            </configuration>
+          </execution>
+          <execution>
+            <id>default-testCompile</id>
+            <phase>test-compile</phase>
+            <goals>
+              <goal>testCompile</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+      <!-- Builds the runnable "jar-with-dependencies" artifact during the package phase.
+           groupId and version are pinned explicitly: this POM has no parent, so without them
+           the plugin version would depend on the defaults of the Maven installation in use. -->
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-assembly-plugin</artifactId>
+        <version>3.7.1</version>
+        <configuration>
+          <archive>
+            <manifest>
+              <!-- Entry point recorded in the manifest so the jar can be started with java -jar. -->
+              <mainClass>org.apache.gluten.qt.QualificationTool</mainClass>
+            </manifest>
+          </archive>
+          <descriptorRefs>
+            <descriptorRef>jar-with-dependencies</descriptorRef>
+          </descriptorRefs>
+        </configuration>
+        <executions>
+          <execution>
+            <id>make-assembly</id> <!-- this is used for an identifier -->
+            <phase>package</phase> <!-- bind to the packaging phase -->
+            <goals>
+              <goal>single</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+      <!-- Configures the manifest of the regular (non-assembly) jar: adds a Class-Path entry
+           and the same main class that the assembly jar uses. -->
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-jar-plugin</artifactId>
+        <version>3.4.2</version>
+        <configuration>
+          <archive>
+            <manifest>
+              <addClasspath>true</addClasspath>
+              <mainClass>org.apache.gluten.qt.QualificationTool</mainClass>
+            </manifest>
+          </archive>
+        </configuration>
+      </plugin>
+      <!-- Surefire test execution is disabled for this module. -->
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-surefire-plugin</artifactId>
+        <version>2.21.0</version>
+        <configuration>
+          <!-- Tests are intended to be run with scalatest-maven-plugin instead.
+               NOTE(review): no scalatest-maven-plugin is declared in this POM, so no tests
+               run in this build as written; confirm whether that is intended. -->
+          <skipTests>true</skipTests>
+        </configuration>
+      </plugin>
+
+      <!-- Formatting and license-header checks for Java and Scala sources. The check goal is
+           bound to the validate phase, so it runs at the start of every build. -->
+      <plugin>
+        <groupId>com.diffplug.spotless</groupId>
+        <artifactId>spotless-maven-plugin</artifactId>
+        <version>${spotless.version}</version>
+        <configuration>
+          <java>
+            <!-- Allows sections to be excluded with spotless:off / spotless:on comments. -->
+            <toggleOffOn />
+            <googleJavaFormat>
+              <version>1.7</version>
+            </googleJavaFormat>
+
+            <!-- \# refers to the static imports -->
+            <importOrder>
+              <order>org.apache.gluten,io.substrait.spark,,javax,java,scala,\#</order>
+            </importOrder>
+
+            <removeUnusedImports />
+            <!-- Header text and delimiter come from the spotless.* properties of this POM. -->
+            <licenseHeader>
+              <content>${spotless.license.header}</content>
+              <delimiter>${spotless.delimiter}</delimiter>
+            </licenseHeader>
+          </java>
+          <scala>
+            <!-- Makes `// spotless:off` / `// spotless:on` comments work in Scala sources. -->
+            <toggleOffOn />
+            <scalafmt>
+              <version>${spotless.scalafmt.version}</version>
+              <scalaMajorVersion>${scala.binary.version}</scalaMajorVersion>
+              <!-- Relative path to a .scalafmt.conf two directories above this module. -->
+              <file>../../.scalafmt.conf</file>
+            </scalafmt>
+            <licenseHeader>
+              <content>${spotless.license.header}</content>
+              <delimiter>${spotless.delimiter}</delimiter>
+            </licenseHeader>
+          </scala>
+        </configuration>
+        <executions>
+          <execution>
+            <id>spotless-check</id>
+            <phase>validate</phase>
+            <goals>
+              <goal>check</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+</project>