Commit
Add support for Redshift Spark connector
Small fix for the OpenLineage converter to resolve the platform properly for SchemaField
Showing 13 changed files with 644 additions and 5 deletions.
9 changes: 9 additions & 0 deletions
...park-lineage-beta/src/main/java/io/openlineage/spark/agent/vendor/redshift/Constants.java
@@ -0,0 +1,9 @@
package io.openlineage.spark.agent.vendor.redshift;

public class Constants {
  public static final String REDSHIFT_CLASS_NAME =
      "io.github.spark_redshift_community.spark.redshift.RedshiftRelation";

  public static final String REDSHIFT_PROVIDER_CLASS_NAME =
      "io.github.spark_redshift_community.spark.redshift.DefaultSource";
}
56 changes: 56 additions & 0 deletions
...lineage-beta/src/main/java/io/openlineage/spark/agent/vendor/redshift/RedshiftVendor.java
@@ -0,0 +1,56 @@
package io.openlineage.spark.agent.vendor.redshift;

import static io.openlineage.spark.agent.vendor.redshift.Constants.*;

import io.openlineage.spark.agent.lifecycle.VisitorFactory;
import io.openlineage.spark.agent.vendor.redshift.lifecycle.RedshiftRelationVisitor;
import io.openlineage.spark.agent.vendor.redshift.lifecycle.RedshiftVisitorFactory;
import io.openlineage.spark.agent.vendor.redshift.lifecycle.plan.RedshiftEventHandlerFactory;
import io.openlineage.spark.api.OpenLineageEventHandlerFactory;
import io.openlineage.spark.api.Vendor;
import java.util.Optional;
import lombok.extern.slf4j.Slf4j;

@Slf4j
public class RedshiftVendor implements Vendor {

  public static boolean hasRedshiftClasses() {
    /*
     Checks for the Redshift provider class with both
     RedshiftRelationVisitor.class.getClassLoader().loadClass and
     Thread.currentThread().getContextClassLoader().loadClass. The first checks whether the class
     is present on the classpath; the second is a catch-all that captures whether the class has
     been installed. This is relevant for Azure Databricks, where jars can be installed and
     accessible to the user even if they are not present on the classpath.
    */
    try {
      RedshiftRelationVisitor.class.getClassLoader().loadClass(REDSHIFT_PROVIDER_CLASS_NAME);
      return true;
    } catch (Exception e) {
      // swallow - the provider class is not on the classpath
    }
    try {
      Thread.currentThread().getContextClassLoader().loadClass(REDSHIFT_PROVIDER_CLASS_NAME);
      return true;
    } catch (Exception e) {
      // swallow - the provider class is not installed
    }
    return false;
  }

  @Override
  public boolean isVendorAvailable() {
    log.info("Checking if Redshift classes are available");
    return hasRedshiftClasses();
  }

  @Override
  public Optional<VisitorFactory> getVisitorFactory() {
    return Optional.of(new RedshiftVisitorFactory());
  }

  @Override
  public Optional<OpenLineageEventHandlerFactory> getEventHandlerFactory() {
    return Optional.of(new RedshiftEventHandlerFactory());
  }
}
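Note: this diff does not show where the vendor gets registered. Assuming the module follows the upstream OpenLineage pattern of enumerating vendor implementations by class name in a static list, the wiring would look roughly like the sketch below; the interface name and the existing Snowflake entry are assumptions, not part of this commit.

import java.util.Arrays;
import java.util.List;

// Hypothetical sketch only: registering the new vendor, assuming the upstream OpenLineage
// convention of a static list of vendor class names that is scanned at startup.
public interface VendorsSketch {
  List<String> VENDORS =
      Arrays.asList(
          "io.openlineage.spark.agent.vendor.snowflake.SnowflakeVendor",
          "io.openlineage.spark.agent.vendor.redshift.RedshiftVendor");
}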
72 changes: 72 additions & 0 deletions
...a/src/main/java/io/openlineage/spark/agent/vendor/redshift/lifecycle/RedshiftDataset.java
@@ -0,0 +1,72 @@
package io.openlineage.spark.agent.vendor.redshift.lifecycle;

import io.openlineage.client.OpenLineage;
import io.openlineage.spark.agent.util.SqlUtils;
import io.openlineage.spark.api.DatasetFactory;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Collections;
import java.util.List;
import java.util.Optional;
import lombok.extern.slf4j.Slf4j;
import org.apache.spark.sql.types.StructType;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

@Slf4j
public class RedshiftDataset {
  public static final String REDSHIFT_PREFIX = "redshift://";

  private static final Logger logger = LoggerFactory.getLogger(RedshiftDataset.class);
  public static final String DEFAULT_SCHEMA = "public";

  public static <D extends OpenLineage.Dataset> List<D> getDatasets(
      DatasetFactory<D> factory,
      String url,
      Optional<String> dbtable,
      Optional<String> query,
      StructType schema)
      throws URISyntaxException {

    URI jdbcUrl =
        new URI(
            REDSHIFT_PREFIX
                + url.replace("jdbc:redshift:iam://", "").replace("jdbc:redshift://", ""));
    String db = jdbcUrl.getPath().substring(1); // remove leading slash
    final String namespace =
        jdbcUrl.getScheme() + "://" + jdbcUrl.getHost() + ":" + jdbcUrl.getPort();

    final String tableName;
    // https://github.com/databricks/spark-redshift?tab=readme-ov-file
    // > Specify one of the following options for the table data to be read:
    // > - `dbtable`: The name of the table to be read. All columns and records are retrieved
    // >   (i.e. it is equivalent to SELECT * FROM db_table).
    // > - `query`: The exact query (SELECT statement) to run.
    // If dbtable is null, it is replaced with the string `complex`, which means the `query`
    // option was used.
    // A possible improvement would be to put the query string in the `DatasetFacets`.
    if (dbtable.isPresent()) {
      tableName = dbtable.get();
      String[] splits = tableName.split("\\.");
      String table = tableName;
      if (splits.length == 1) {
        table = String.format("%s.%s.%s", db, DEFAULT_SCHEMA, tableName);
      } else if (splits.length == 2) {
        table = String.format("%s.%s", db, tableName);
      } else if (splits.length == 3) {
        table = tableName;
      } else {
        logger.warn("Redshift getDataset: tableName: {} is not in the expected format", tableName);
        return Collections.emptyList();
      }

      return Collections.singletonList(factory.getDataset(table, namespace, schema));
    } else if (query.isPresent()) {
      return SqlUtils.getDatasets(factory, query.get(), "redshift", namespace, db, DEFAULT_SCHEMA);
    } else {
      logger.warn(
          "Unable to discover Redshift table property - neither \"dbtable\" nor \"query\" option present");
    }
    return Collections.emptyList();
  }
}
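A minimal, self-contained sketch of how the naming logic above resolves a JDBC URL plus an unqualified dbtable into an OpenLineage namespace and a db.schema.table name; the cluster endpoint, database, and table names are made-up example values.

import java.net.URI;
import java.net.URISyntaxException;

public class RedshiftNamingExample {
  public static void main(String[] args) throws URISyntaxException {
    // Example inputs (hypothetical values) as passed via the connector's "url" and "dbtable" options.
    String url = "jdbc:redshift://examplecluster.abc123.us-east-1.redshift.amazonaws.com:5439/dev";
    String dbtable = "sales"; // unqualified, so the default schema "public" is assumed

    URI jdbcUrl =
        new URI("redshift://" + url.replace("jdbc:redshift:iam://", "").replace("jdbc:redshift://", ""));
    String db = jdbcUrl.getPath().substring(1); // "dev"
    String namespace = jdbcUrl.getScheme() + "://" + jdbcUrl.getHost() + ":" + jdbcUrl.getPort();
    String table = String.format("%s.%s.%s", db, "public", dbtable);

    // Prints: redshift://examplecluster.abc123.us-east-1.redshift.amazonaws.com:5439 dev.public.sales
    System.out.println(namespace + " " + table);
  }
}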
66 changes: 66 additions & 0 deletions
...in/java/io/openlineage/spark/agent/vendor/redshift/lifecycle/RedshiftRelationVisitor.java
@@ -0,0 +1,66 @@
package io.openlineage.spark.agent.vendor.redshift.lifecycle;

import io.github.spark_redshift_community.spark.redshift.Parameters;
import io.github.spark_redshift_community.spark.redshift.RedshiftRelation;
import io.github.spark_redshift_community.spark.redshift.TableName;
import io.openlineage.client.OpenLineage;
import io.openlineage.spark.agent.util.ScalaConversionUtils;
import io.openlineage.spark.api.DatasetFactory;
import io.openlineage.spark.api.OpenLineageContext;
import io.openlineage.spark.api.QueryPlanVisitor;
import java.util.Collections;
import java.util.List;
import java.util.Optional;
import lombok.NonNull;
import lombok.extern.slf4j.Slf4j;
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan;
import org.apache.spark.sql.execution.datasources.LogicalRelation;

/**
 * {@link LogicalPlan} visitor that matches {@link LogicalRelation}s backed by a {@link
 * RedshiftRelation}. It extracts an {@link OpenLineage.Dataset} for the Redshift table
 * referenced by the relation.
 */
@Slf4j
public class RedshiftRelationVisitor<D extends OpenLineage.Dataset>
    extends QueryPlanVisitor<LogicalRelation, D> {
  private static final String REDSHIFT_NAMESPACE = "redshift";
  private static final String REDSHIFT_CLASS_NAME =
      "io.github.spark_redshift_community.spark.redshift.RedshiftRelation";
  private final DatasetFactory<D> factory;

  public RedshiftRelationVisitor(@NonNull OpenLineageContext context, DatasetFactory<D> factory) {
    super(context);
    this.factory = factory;
    log.info("RedshiftRelationVisitor created");
  }

  @Override
  public List<D> apply(LogicalPlan x) {
    RedshiftRelation relation = (RedshiftRelation) ((LogicalRelation) x).relation();
    Parameters.MergedParameters params = relation.params();
    Optional<String> dbtable =
        (Optional<String>)
            ScalaConversionUtils.asJavaOptional(params.table().map(TableName::toString));
    Optional<String> query = ScalaConversionUtils.asJavaOptional(params.query());
    return Collections.singletonList(
        factory.getDataset(dbtable.orElse(""), REDSHIFT_NAMESPACE, relation.schema()));
  }

  protected boolean isRedshiftClass(LogicalPlan plan) {
    try {
      Class c = Thread.currentThread().getContextClassLoader().loadClass(REDSHIFT_CLASS_NAME);
      return (plan instanceof LogicalRelation
          && c.isAssignableFrom(((LogicalRelation) plan).relation().getClass()));
    } catch (Exception e) {
      // swallow - not a Redshift class
    }
    return false;
  }

  @Override
  public boolean isDefinedAt(LogicalPlan plan) {
    return isRedshiftClass(plan);
  }
}
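For context, a read like the following sketch (the endpoint, table, and S3 temp directory are placeholder values) yields a LogicalRelation wrapping a RedshiftRelation in the analyzed plan, which isDefinedAt matches and apply maps to an OpenLineage input dataset.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class RedshiftReadExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("redshift-read-example").getOrCreate();
    // Read via the community Redshift connector; all option values below are placeholders.
    Dataset<Row> df =
        spark
            .read()
            .format("io.github.spark_redshift_community.spark.redshift")
            .option("url", "jdbc:redshift://examplecluster.abc123.us-east-1.redshift.amazonaws.com:5439/dev")
            .option("dbtable", "public.sales")
            .option("tempdir", "s3a://example-bucket/redshift-temp/")
            .option("forward_spark_s3_credentials", "true")
            .load();
    df.show();
  }
}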
26 changes: 26 additions & 0 deletions
...ain/java/io/openlineage/spark/agent/vendor/redshift/lifecycle/RedshiftVisitorFactory.java
@@ -0,0 +1,26 @@
package io.openlineage.spark.agent.vendor.redshift.lifecycle;

import io.openlineage.client.OpenLineage;
import io.openlineage.spark.agent.lifecycle.VisitorFactory;
import io.openlineage.spark.api.DatasetFactory;
import io.openlineage.spark.api.OpenLineageContext;
import java.util.Collections;
import java.util.List;
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan;
import scala.PartialFunction;

public class RedshiftVisitorFactory implements VisitorFactory {
  @Override
  public List<PartialFunction<LogicalPlan, List<OpenLineage.InputDataset>>> getInputVisitors(
      OpenLineageContext context) {
    DatasetFactory<OpenLineage.InputDataset> factory = DatasetFactory.input(context);
    return Collections.singletonList(new RedshiftRelationVisitor<>(context, factory));
  }

  @Override
  public List<PartialFunction<LogicalPlan, List<OpenLineage.OutputDataset>>> getOutputVisitors(
      OpenLineageContext context) {
    DatasetFactory<OpenLineage.OutputDataset> factory = DatasetFactory.output(context);
    return Collections.singletonList(new RedshiftRelationVisitor<>(context, factory));
  }
}
20 changes: 20 additions & 0 deletions
...o/openlineage/spark/agent/vendor/redshift/lifecycle/plan/RedshiftEventHandlerFactory.java
@@ -0,0 +1,20 @@
package io.openlineage.spark.agent.vendor.redshift.lifecycle.plan;

import io.openlineage.client.OpenLineage;
import io.openlineage.spark.api.OpenLineageContext;
import io.openlineage.spark.api.OpenLineageEventHandlerFactory;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import scala.PartialFunction;

public class RedshiftEventHandlerFactory implements OpenLineageEventHandlerFactory {
  @Override
  public Collection<PartialFunction<Object, List<OpenLineage.OutputDataset>>>
      createOutputDatasetBuilder(OpenLineageContext context) {
    // The right function will be determined at runtime by type checking against the
    // matching Spark LogicalPlan node.
    return Collections.singleton(
        (PartialFunction) new RedshiftSaveIntoDataSourceCommandBuilder(context));
  }
}
80 changes: 80 additions & 0 deletions
.../spark/agent/vendor/redshift/lifecycle/plan/RedshiftSaveIntoDataSourceCommandBuilder.java
@@ -0,0 +1,80 @@
package io.openlineage.spark.agent.vendor.redshift.lifecycle.plan;

import static io.openlineage.spark.agent.vendor.redshift.RedshiftVendor.hasRedshiftClasses;

import io.openlineage.client.OpenLineage;
import io.openlineage.spark.agent.util.PlanUtils;
import io.openlineage.spark.agent.util.ScalaConversionUtils;
import io.openlineage.spark.agent.vendor.redshift.lifecycle.RedshiftDataset;
import io.openlineage.spark.api.AbstractQueryPlanDatasetBuilder;
import io.openlineage.spark.api.OpenLineageContext;
import java.net.URISyntaxException;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import lombok.extern.slf4j.Slf4j;
import org.apache.spark.scheduler.SparkListenerEvent;
import org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand;
import org.apache.spark.sql.sources.CreatableRelationProvider;
import org.apache.spark.sql.types.StructType;

@Slf4j
public class RedshiftSaveIntoDataSourceCommandBuilder
    extends AbstractQueryPlanDatasetBuilder<
        SparkListenerEvent, SaveIntoDataSourceCommand, OpenLineage.OutputDataset> {

  public RedshiftSaveIntoDataSourceCommandBuilder(OpenLineageContext context) {
    super(context, false);
  }

  @Override
  public List<OpenLineage.OutputDataset> apply(SaveIntoDataSourceCommand command) {
    if (isRedshiftSource(command.dataSource())) {
      // Called from SaveIntoDataSourceCommandVisitor on Redshift write operations.
      Map<String, String> options = ScalaConversionUtils.<String, String>fromMap(command.options());
      log.info("Redshift SaveIntoDataSourceCommand options: {}", options);
      Optional<String> dbtable = Optional.ofNullable(options.get("dbtable"));
      Optional<String> query = Optional.ofNullable(options.get("query"));
      String url = options.get("url");

      try {
        // As with Kafka and Snowflake, Redshift writes need some special handling, so we use
        // RedshiftDataset.getDatasets to extract the dataset from the write operation.
        // command.schema() does not always contain the schema (observed with Azure Snowflake in
        // the builder this is modeled on), so getSchema extracts it from the logical plan.
        return RedshiftDataset.getDatasets(outputDataset(), url, dbtable, query, getSchema(command));
      } catch (URISyntaxException e) {
        throw new RuntimeException(e);
      }
    } else {
      return Collections.emptyList();
    }
  }

  public static boolean isRedshiftSource(CreatableRelationProvider provider) {
    return hasRedshiftClasses(); // && provider instanceof DefaultSource;
  }

  /**
   * Taken from {@link
   * io.openlineage.spark.agent.lifecycle.plan.SaveIntoDataSourceCommandVisitor#getSchema(org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand)}
   *
   * @param command the save command to extract the schema from
   * @return the command's schema, falling back to the schema of the logical plan's output
   */
  private StructType getSchema(SaveIntoDataSourceCommand command) {
    StructType schema = command.schema();
    if ((schema == null || schema.fields() == null || schema.fields().length == 0)
        && command.query() != null
        && command.query().output() != null) {
      // get schema from logical plan's output
      schema = PlanUtils.toStructType(ScalaConversionUtils.fromSeq(command.query().output()));
    }
    return schema;
  }
}
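On the output side, a write like the following sketch (placeholder endpoint, table, and temp directory) is planned as a SaveIntoDataSourceCommand whose "url" and "dbtable" options the builder above hands to RedshiftDataset.getDatasets.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class RedshiftWriteExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder().appName("redshift-write-example").getOrCreate();
    Dataset<Row> df = spark.table("staging_sales"); // placeholder source table
    // Write via the community Redshift connector; all option values below are placeholders.
    df.write()
        .format("io.github.spark_redshift_community.spark.redshift")
        .option("url", "jdbc:redshift://examplecluster.abc123.us-east-1.redshift.amazonaws.com:5439/dev")
        .option("dbtable", "public.sales")
        .option("tempdir", "s3a://example-bucket/redshift-temp/")
        .mode(SaveMode.Append)
        .save();
  }
}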