From 075d19ef166177ececfbb39796de4721bdde9dc1 Mon Sep 17 00:00:00 2001 From: Tamas Nemeth Date: Wed, 4 May 2022 21:57:26 +0200 Subject: [PATCH] fix(doc): improving docs across multiple sources (#4815) --- .../docs/sources/athena/athena_recipe.yml | 12 ++ .../docs/sources/pulsar/README.md | 38 +++++ .../docs/sources/pulsar/pulsar.md | 138 ------------------ .../docs/sources/pulsar/pulsar_recipe.yml | 23 +++ .../docs/sources/snowflake/README.md | 1 + .../docs/sources/snowflake/snowflake-usage.md | 26 ++++ .../snowflake/snowflake-usage_recipe.yml | 26 ++++ .../docs/sources/snowflake/snowflake.md | 56 +++++++ .../sources/snowflake/snowflake_recipe.yml | 53 +++++++ .../src/datahub/ingestion/source/aws/glue.py | 10 +- .../src/datahub/ingestion/source/pulsar.py | 13 ++ .../datahub/ingestion/source/sql/athena.py | 27 +++- .../datahub/ingestion/source/sql/redshift.py | 5 +- .../datahub/ingestion/source/sql/snowflake.py | 8 + .../source/usage/starburst_trino_usage.py | 4 - .../datahub/ingestion/source_config/pulsar.py | 85 +++++++---- .../ingestion/source_config/sql/bigquery.py | 59 ++++++-- .../ingestion/source_config/sql/snowflake.py | 93 +++++++++--- .../source_config/usage/bigquery_usage.py | 108 ++++++++++---- .../source_config/usage/snowflake_usage.py | 43 ++++-- 20 files changed, 581 insertions(+), 247 deletions(-) create mode 100644 metadata-ingestion/docs/sources/athena/athena_recipe.yml create mode 100644 metadata-ingestion/docs/sources/pulsar/README.md create mode 100644 metadata-ingestion/docs/sources/pulsar/pulsar_recipe.yml create mode 100644 metadata-ingestion/docs/sources/snowflake/README.md create mode 100644 metadata-ingestion/docs/sources/snowflake/snowflake-usage.md create mode 100644 metadata-ingestion/docs/sources/snowflake/snowflake-usage_recipe.yml create mode 100644 metadata-ingestion/docs/sources/snowflake/snowflake.md create mode 100644 metadata-ingestion/docs/sources/snowflake/snowflake_recipe.yml diff --git a/metadata-ingestion/docs/sources/athena/athena_recipe.yml b/metadata-ingestion/docs/sources/athena/athena_recipe.yml new file mode 100644 index 0000000000000..4d4fe2cef5a72 --- /dev/null +++ b/metadata-ingestion/docs/sources/athena/athena_recipe.yml @@ -0,0 +1,12 @@ +source: + type: athena + config: + # Coordinates + aws_region: my_aws_region + work_group: primary + + # Options + s3_staging_dir: "s3://my_staging_athena_results_bucket/results/" + +sink: +# sink configs diff --git a/metadata-ingestion/docs/sources/pulsar/README.md b/metadata-ingestion/docs/sources/pulsar/README.md new file mode 100644 index 0000000000000..c70117865222e --- /dev/null +++ b/metadata-ingestion/docs/sources/pulsar/README.md @@ -0,0 +1,38 @@ +## Integration Details + + + + +The Datahub Pulsar source plugin extracts `topic` and `schema` metadata from an Apache Pulsar instance and ingest the information into Datahub. The plugin uses the [Pulsar admin Rest API interface](https://pulsar.apache.org/admin-rest-api/#) to interact with the Pulsar instance. 
The following APIs are used in order to: +- [Get the list of existing tenants](https://pulsar.apache.org/admin-rest-api/#tag/tenants) +- [Get the list of namespaces associated with each tenant](https://pulsar.apache.org/admin-rest-api/#tag/namespaces) +- [Get the list of topics associated with each namespace](https://pulsar.apache.org/admin-rest-api/#tag/persistent-topic) + - persistent topics + - persistent partitioned topics + - non-persistent topics + - non-persistent partitioned topics +- [Get the latest schema associated with each topic](https://pulsar.apache.org/admin-rest-api/#tag/schemas) + +The data is extracted on `tenant` and `namespace` basis, topics with corresponding schema (if available) are ingested as [Dataset](docs/generated/metamodel/entities/dataset.md) into Datahub. Some additional values like `schema description`, `schema_version`, `schema_type` and `partitioned` are included as `DatasetProperties`. + + +### Concept Mapping + + + + +This ingestion source maps the following Source System Concepts to DataHub Concepts: + + + + +| Source Concept | DataHub Concept | Notes | +|----------------|--------------------------------------------------------------------|---------------------------------------------------------------------------| +| `pulsar` | [Data Platform](docs/generated/metamodel/entities/dataPlatform.md) | | +| Pulsar Topic | [Dataset](docs/generated/metamodel/entities/dataset.md) | _subType_: `topic` | +| Pulsar Schema | [SchemaField](docs/generated/metamodel/entities/schemaField.md) | Maps to the fields defined within the `Avro` or `JSON` schema definition. | + + +## Metadata Ingestion Quickstart + +For context on getting started with ingestion, check out our [metadata ingestion guide](../../../../metadata-ingestion/README.md). diff --git a/metadata-ingestion/docs/sources/pulsar/pulsar.md b/metadata-ingestion/docs/sources/pulsar/pulsar.md index 5149de376778e..30449f47398f0 100644 --- a/metadata-ingestion/docs/sources/pulsar/pulsar.md +++ b/metadata-ingestion/docs/sources/pulsar/pulsar.md @@ -1,69 +1,3 @@ -# Pulsar - - - -![Incubating](https://img.shields.io/badge/support%20status-incubating-blue) - -## Integration Details - - - - -The Datahub Pulsar source plugin extracts `topic` and `schema` metadata from an Apache Pulsar instance and ingest the information into Datahub. The plugin uses the [Pulsar admin Rest API interface](https://pulsar.apache.org/admin-rest-api/#) to interact with the Pulsar instance. The following APIs are used in order to: -- [Get the list of existing tenants](https://pulsar.apache.org/admin-rest-api/#tag/tenants) -- [Get the list of namespaces associated with each tenant](https://pulsar.apache.org/admin-rest-api/#tag/namespaces) -- [Get the list of topics associated with each namespace](https://pulsar.apache.org/admin-rest-api/#tag/persistent-topic) - - persistent topics - - persistent partitioned topics - - non-persistent topics - - non-persistent partitioned topics -- [Get the latest schema associated with each topic](https://pulsar.apache.org/admin-rest-api/#tag/schemas) - -The data is extracted on `tenant` and `namespace` basis, topics with corresponding schema (if available) are ingested as [Dataset](docs/generated/metamodel/entities/dataset.md) into Datahub. Some additional values like `schema description`, `schema_version`, `schema_type` and `partitioned` are included as `DatasetProperties`. 
- - -### Concept Mapping - - - - -This ingestion source maps the following Source System Concepts to DataHub Concepts: - - - - -| Source Concept | DataHub Concept | Notes | -|----------------|--------------------------------------------------------------------|---------------------------------------------------------------------------| -| `pulsar` | [Data Platform](docs/generated/metamodel/entities/dataPlatform.md) | | -| Pulsar Topic | [Dataset](docs/generated/metamodel/entities/dataset.md) | _subType_: `topic` | -| Pulsar Schema | [SchemaField](docs/generated/metamodel/entities/schemaField.md) | Maps to the fields defined within the `Avro` or `JSON` schema definition. | - - -### Supported Capabilities - - - - -| Capability | Status | Notes | -|-------------------------------------------------------|:------:|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| Data Container | ❌ | | -| [Stateful Ingestion](../../../../metadata-ingestion/docs/dev_guides/stateful.md) | ✅ | Requires recipe configuration, stateful Ingestion is available only when a Platform Instance is assigned to this source. | -| Partition Support | ✅ | Requires recipe configuration, each individual partition topic can be ingest. Behind the scenes, a partitioned topic is actually implemented as N internal topics, where N is the number of partitions. This feature is disabled by default. | -| [Platform Instance](../../../platform-instances.md) | ✅ | Requires recipe configuration and is mandatory for Stateful Ingestion. A Pulsar instance consists of one or more Pulsar clusters. | -| [Data Domain](../../../domains.md) | ✅ | Requires recipe configuration | -| Dataset Profiling | ❌ | | -| Dataset Usage | ❌ | | -| Extract Descriptions | ❌ | | -| Extract Lineage | ❌ | | -| Extract Ownership | ❌ | | -| Extract Tags | ❌ | | -| ... | | - -## Metadata Ingestion Quickstart - -For context on getting started with ingestion, check out our [metadata ingestion guide](../../../../metadata-ingestion/README.md). - ### Prerequisites In order to ingest metadata from Apache Pulsar, you will need: @@ -100,77 +34,5 @@ sink: # sink configs ``` - -#### Example recipe with authentication -An example recipe for ingesting from a Pulsar instance with oauth authentication and ssl enabled. - - -```yml -source: - type: "pulsar" - config: - env: "TEST" - platform_instance: "local" - ## Pulsar client connection config ## - web_service_url: "https://localhost:8443" - verify_ssl: "/opt/certs/ca.cert.pem" - # Issuer url for auth document, for example "http://localhost:8083/realms/pulsar" - issuer_url: - client_id: ${CLIENT_ID} - client_secret: ${CLIENT_SECRET} - # Tenant list to scrape - tenants: - - tenant_1 - - tenant_2 - # Topic filter pattern - topic_patterns: - allow: - - ".*sales.*" - -sink: - # sink configs -``` - > **_NOTE:_** Always use TLS encryption in a production environment and use variable substitution for sensitive information (e.g. ${CLIENT_ID} and ${CLIENT_SECRET}). > - -## Config details -
- View All Recipe Configuration Options - -Note that a `.` is used to denote nested fields in the YAML recipe. - -| Field | Required | Default | Description | -|---------------------------------|:--------:|-------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------| -| `env` | ❌ | `PROD` | The data fabric, defaults to PROD | -| `platform_instance` | ❌ | | The Platform instance to use while constructing URNs. Mandatory for Stateful Ingestion | -| `web_service_url` | ✅ | `http://localhost:8080` | The web URL for the cluster. | -| `timeout` | ❌ | `5` | Timout setting, how long to wait for the Pulsar rest api to send data before giving up | -| `verify_ssl` | ❌ | `True` | Either a boolean, in which case it controls whether we verify the server's TLS certificate, or a string, in which case it must be a path to a CA bundle to use. | -| `issuer_url` | ❌ | | The complete URL for a Custom Authorization Server. Mandatory for OAuth based authentication. | -| `client_id` | ❌ | | The application's client ID | -| `client_secret` | ❌ | | The application's client secret | -| `token` | ❌ | | The access token for the application. Mandatory for token based authentication. | -| `tenant_patterns.allow` | ❌ | `.*` | List of regex patterns for tenants to include in ingestion. By default all tenants are allowed. | -| `tenant_patterns.deny` | ❌ | `pulsar` | List of regex patterns for tenants to exclude from ingestion. By default the Pulsar system tenant is denied. | -| `tenant_patterns.ignoreCase` | ❌ | `True` | Whether to ignore case sensitivity during tenant pattern matching. | -| `namespace_patterns.allow` | ❌ | `.*` | List of regex patterns for namespaces to include in ingestion. By default all namespaces are allowed. | -| `namespace_patterns.deny` | ❌ | `public/functions` | List of regex patterns for namespaces to exclude from ingestion. By default the functions namespace is denied. | -| `namespace_patterns.ignoreCase` | ❌ | `True` | Whether to ignore case sensitivity during namespace pattern matching. | -| `topic_patterns.allow` | ❌ | `.*` | List of regex patterns for topics to include in ingestion. By default all topics are allowed. | -| `topic_patterns.deny` | ❌ | `/__.*$` | List of regex patterns for topics to exclude from ingestion. By default the Pulsar system topics are denied. | -| `topic_patterns.ignoreCase` | ❌ | `True` | Whether to ignore case sensitivity during topic pattern matching. | -| `tenants` | ❌ | | Listing all tenants requires superUser role, alternative you can set a list of tenants you want to scrape using the tenant admin role | -| `exclude_individual_partitions` | ❌ | `True` | Extract each individual partitioned topic. e.g. when turned off a topic with 100 partitions will result in 100 `Datesets`. | -| `domain.domain_urn.allow` | ❌ | | List of regex patterns for topics to set domain_urn domain key. There can be multiple domain key specified. | -| `domain.domain_urn.deny` | ❌ | | List of regex patterns for topics to not assign domain_urn. There can be multiple domain key specified. | -| `domain.domain_urn.ignoreCase` | ❌ | `True` | Whether to ignore case sensitivity during pattern matching.There can be multiple domain key specified. | -| `stateful_ingestion` | ❌ | | see [Stateful Ingestion](../../../../metadata-ingestion/docs/dev_guides/stateful.md) | -
- - -## Troubleshooting - -### [Common Issue] - -[Provide description of common issues with this integration and steps to resolve] diff --git a/metadata-ingestion/docs/sources/pulsar/pulsar_recipe.yml b/metadata-ingestion/docs/sources/pulsar/pulsar_recipe.yml new file mode 100644 index 0000000000000..3538e7d417cd5 --- /dev/null +++ b/metadata-ingestion/docs/sources/pulsar/pulsar_recipe.yml @@ -0,0 +1,23 @@ +source: + type: "pulsar" + config: + env: "TEST" + platform_instance: "local" + ## Pulsar client connection config ## + web_service_url: "https://localhost:8443" + verify_ssl: "/opt/certs/ca.cert.pem" + # Issuer url for auth document, for example "http://localhost:8083/realms/pulsar" + issuer_url: + client_id: ${CLIENT_ID} + client_secret: ${CLIENT_SECRET} + # Tenant list to scrape + tenants: + - tenant_1 + - tenant_2 + # Topic filter pattern + topic_patterns: + allow: + - ".*sales.*" + +sink: +# sink configs diff --git a/metadata-ingestion/docs/sources/snowflake/README.md b/metadata-ingestion/docs/sources/snowflake/README.md new file mode 100644 index 0000000000000..62c6c15513557 --- /dev/null +++ b/metadata-ingestion/docs/sources/snowflake/README.md @@ -0,0 +1 @@ +To get all metadata from Snowflake you need to use two plugins `snowflake` and `snowflake-usage`. Both of them are described in this page. These will require 2 separate recipes. We understand this is not ideal and we plan to make this easier in the future. diff --git a/metadata-ingestion/docs/sources/snowflake/snowflake-usage.md b/metadata-ingestion/docs/sources/snowflake/snowflake-usage.md new file mode 100644 index 0000000000000..b684c83cb75ad --- /dev/null +++ b/metadata-ingestion/docs/sources/snowflake/snowflake-usage.md @@ -0,0 +1,26 @@ +### Prerequisites + +In order to execute the `snowflake-usage` source, your Snowflake user will need to have specific privileges granted to it. Specifically, you'll need to grant access to the [Account Usage](https://docs.snowflake.com/en/sql-reference/account-usage.html) system tables, using which the DataHub source extracts information. Assuming you've followed the steps outlined in `snowflake` plugin to create a DataHub-specific User & Role, you'll simply need to execute the following commands in Snowflake. This will require a user with the `ACCOUNTADMIN` role (or a role granted the IMPORT SHARES global privilege). Please see [Snowflake docs for more details](https://docs.snowflake.com/en/user-guide/data-share-consumers.html). + +```sql +grant imported privileges on database snowflake to role datahub_role; +``` + +### Capabilities + +This plugin extracts the following: + +- Statistics on queries issued and tables and columns accessed (excludes views) +- Aggregation of these statistics into buckets, by day or hour granularity + + +:::note + +This source only does usage statistics. To get the tables, views, and schemas in your Snowflake warehouse, ingest using the `snowflake` source described above. + +::: + +### Caveats +- Some of the features are only available in the Snowflake Enterprise Edition. This docs has notes mentioning where this applies. +- The underlying Snowflake views that we use to get metadata have a [latency of 45 minutes to 3 hours](https://docs.snowflake.com/en/sql-reference/account-usage.html#differences-between-account-usage-and-information-schema). So we would not be able to get very recent metadata in some cases like queries you ran within that time period etc.. 
+- If there is any [incident going on for Snowflake](https://status.snowflake.com/) we will not be able to get the metadata until that incident is resolved. diff --git a/metadata-ingestion/docs/sources/snowflake/snowflake-usage_recipe.yml b/metadata-ingestion/docs/sources/snowflake/snowflake-usage_recipe.yml new file mode 100644 index 0000000000000..6567a842a4def --- /dev/null +++ b/metadata-ingestion/docs/sources/snowflake/snowflake-usage_recipe.yml @@ -0,0 +1,26 @@ +source: + type: snowflake-usage + config: + # Coordinates + account_id: account_name + warehouse: "COMPUTE_WH" + + # Credentials + username: "${SNOWFLAKE_USER}" + password: "${SNOWFLAKE_PASS}" + role: "datahub_role" + + # Options + top_n_queries: 10 + email_domain: mycompany.com + + database_pattern: + allow: + - "^ACCOUNTING_DB$" + - "^MARKETING_DB$" + schema_pattern: + deny: + - "information_schema.*" + +sink: +# sink configs diff --git a/metadata-ingestion/docs/sources/snowflake/snowflake.md b/metadata-ingestion/docs/sources/snowflake/snowflake.md new file mode 100644 index 0000000000000..a2190b7c1ba50 --- /dev/null +++ b/metadata-ingestion/docs/sources/snowflake/snowflake.md @@ -0,0 +1,56 @@ +### Prerequisites + +In order to execute this source, your Snowflake user will need to have specific privileges granted to it for reading metadata +from your warehouse. + +You can use the `provision_role` block in the recipe to grant the requires roles. + +If your system admins prefer running the commands themselves then they can follow this guide to create a DataHub-specific role, assign it the required privileges, and assign it to a new DataHub user by executing the following Snowflake commands from a user with the `ACCOUNTADMIN` role or `MANAGE GRANTS` privilege. + +```sql +create or replace role datahub_role; + +// Grant access to a warehouse to run queries to view metadata +grant operate, usage on warehouse "" to role datahub_role; + +// Grant access to view database and schema in which your tables/views exist +grant usage on DATABASE "" to role datahub_role; +grant usage on all schemas in database "" to role datahub_role; +grant usage on future schemas in database "" to role datahub_role; + +// If you are NOT using Snowflake Profiling feature: Grant references privileges to your tables and views +grant references on all tables in database "" to role datahub_role; +grant references on future tables in database "" to role datahub_role; +grant references on all external tables in database "" to role datahub_role; +grant references on future external tables in database "" to role datahub_role; +grant references on all views in database "" to role datahub_role; +grant references on future views in database "" to role datahub_role; + +// If you ARE using Snowflake Profiling feature: Grant select privileges to your tables and views +grant select on all tables in database "" to role datahub_role; +grant select on future tables in database "" to role datahub_role; +grant select on all external tables in database "" to role datahub_role; +grant select on future external tables in database "" to role datahub_role; +grant select on all views in database "" to role datahub_role; +grant select on future views in database "" to role datahub_role; + +// Create a new DataHub user and assign the DataHub role to it +create user datahub_user display_name = 'DataHub' password='' default_role = datahub_role default_warehouse = ''; + +// Grant the datahub_role to the new DataHub user. 
+grant role datahub_role to user datahub_user; +``` + +The details of each granted privilege can be viewed in [snowflake docs](https://docs.snowflake.com/en/user-guide/security-access-control-privileges.html). A summarization of each privilege, and why it is required for this connector: +- `operate` is required on warehouse to execute queries +- `usage` is required for us to run queries using the warehouse +- `usage` on `database` and `schema` are required because without it tables and views inside them are not accessible. If an admin does the required grants on `table` but misses the grants on `schema` or the `database` in which the table/view exists then we will not be able to get metadata for the table/view. +- If metadata is required only on some schemas then you can grant the usage privilieges only on a particular schema like +```sql +grant usage on schema ""."" to role datahub_role; +``` +- To get the lineage and usage data we need access to the default `snowflake` database + +This represents the bare minimum privileges required to extract databases, schemas, views, tables from Snowflake. + +If you plan to enable extraction of table lineage, via the `include_table_lineage` config flag, you'll need to grant additional privileges. See [snowflake usage prerequisites](#prerequisites-1) as the same privilege is required for this purpose too. diff --git a/metadata-ingestion/docs/sources/snowflake/snowflake_recipe.yml b/metadata-ingestion/docs/sources/snowflake/snowflake_recipe.yml new file mode 100644 index 0000000000000..93279fa26650f --- /dev/null +++ b/metadata-ingestion/docs/sources/snowflake/snowflake_recipe.yml @@ -0,0 +1,53 @@ +source: + type: snowflake + config: + + check_role_grants: True + provision_role: # Optional + enabled: false + dry_run: true + run_ingestion: false + admin_username: "${SNOWFLAKE_ADMIN_USER}" + admin_password: "${SNOWFLAKE_ADMIN_PASS}" + + # This option is recommended to be used for the first time to ingest all lineage + ignore_start_time_lineage: true + # This is an alternative option to specify the start_time for lineage + # if you don't want to look back since beginning + start_time: '2022-03-01T00:00:00Z' + + # Coordinates + account_id: "abc48144" + warehouse: "COMPUTE_WH" + + # Credentials + username: "${SNOWFLAKE_USER}" + password: "${SNOWFLAKE_PASS}" + role: "datahub_role" + + # Change these as per your database names. Remove to all all databases + database_pattern: + allow: + - "^ACCOUNTING_DB$" + - "^MARKETING_DB$" + schema_pattern: + deny: + - "information_schema.*" + table_pattern: + allow: + # If you want to ingest only few tables with name revenue and sales + - ".*revenue" + - ".*sales" + + profiling: + # Change to false to disable profiling + enabled: true + profile_pattern: + allow: + - 'ACCOUNTING_DB.*.*' + - 'MARKETING_DB.*.*' + deny: + - '.*information_schema.*' + +sink: +# sink configs \ No newline at end of file diff --git a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py index 3aeeebd26e76b..32b0888a98c9d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py +++ b/metadata-ingestion/src/datahub/ingestion/source/aws/glue.py @@ -110,8 +110,14 @@ class GlueSourceConfig(AwsSourceConfig, PlatformSourceConfigBase): description="The aws account id where the target glue catalog lives. 
If None, datahub will ingest glue in aws caller's account.", ) - use_s3_bucket_tags: Optional[bool] = False - use_s3_object_tags: Optional[bool] = False + use_s3_bucket_tags: Optional[bool] = Field( + default=False, + description="If an S3 Buckets Tags should be created for the Tables ingested by Glue. Please Note that this will not apply tags to any folders ingested, only the files.", + ) + use_s3_object_tags: Optional[bool] = Field( + default=False, + description="If an S3 Objects Tags should be created for the Tables ingested by Glue.", + ) @property def glue_client(self): diff --git a/metadata-ingestion/src/datahub/ingestion/source/pulsar.py b/metadata-ingestion/src/datahub/ingestion/source/pulsar.py index 2e87b7a516272..e4d9a505ea721 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/pulsar.py +++ b/metadata-ingestion/src/datahub/ingestion/source/pulsar.py @@ -17,6 +17,14 @@ from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.emitter.mcp_builder import add_domain_to_entity_wu from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.api.decorators import ( + SourceCapability, + SupportStatus, + capability, + config_class, + platform_name, + support_status, +) from datahub.ingestion.api.workunit import MetadataWorkUnit from datahub.ingestion.extractor import schema_util from datahub.ingestion.source.state.checkpoint import Checkpoint @@ -78,6 +86,11 @@ def __init__(self, schema): self.properties = schema.get("properties") +@platform_name("Pulsar") +@support_status(SupportStatus.INCUBATING) +@config_class(PulsarSourceConfig) +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") +@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field") @dataclass class PulsarSource(StatefulIngestionSourceBase): def __init__(self, config: PulsarSourceConfig, ctx: PipelineContext): diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py index b89a078233508..299ce9f5247a1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/athena.py @@ -10,7 +10,9 @@ from datahub.emitter.mcp_builder import DatabaseKey, gen_containers from datahub.ingestion.api.decorators import ( + SourceCapability, SupportStatus, + capability, config_class, platform_name, support_status, @@ -33,10 +35,19 @@ class AthenaConfig(SQLAlchemyConfig): password: Optional[str] = pydantic.Field( default=None, description="Same detection scheme as username" ) - database: Optional[str] = None - aws_region: str - s3_staging_dir: str - work_group: str + database: Optional[str] = pydantic.Field( + default=None, + description="The athena database to ingest from. 
If not set it will be autodetected", + ) + aws_region: str = pydantic.Field( + description="Aws region where your Athena database is located" + ) + s3_staging_dir: str = pydantic.Field( + description="Staging s3 location where the Athena query results will be stored" + ) + work_group: str = pydantic.Field( + description="The name of your Amazon Athena Workgroups" + ) include_views = False # not supported for Athena @@ -57,6 +68,14 @@ def get_sql_alchemy_url(self): @platform_name("Athena") @support_status(SupportStatus.CERTIFIED) @config_class(AthenaConfig) +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") +@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field") +@capability( + SourceCapability.DATA_PROFILING, + "Optionally enabled via configuration. Profiling uses sql queries on whole table which can be expensive operation.", +) +@capability(SourceCapability.DESCRIPTIONS, "Enabled by default") +@capability(SourceCapability.LINEAGE_COARSE, "Optionally enabled via configuration") class AthenaSource(SQLAlchemySource): """ This plugin supports extracting the following metadata from Athena diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/redshift.py b/metadata-ingestion/src/datahub/ingestion/source/sql/redshift.py index b6b1cd4d0d417..da4db1eb6b1dd 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/redshift.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/redshift.py @@ -131,12 +131,13 @@ class RedshiftConfig( description="Whether lineage should be collected from copy commands", ) capture_lineage_query_parser_failures: Optional[bool] = Field( - default=False, description="" + default=False, + description="Whether to capture lineage query parser errors with dataset properties for debuggings", ) table_lineage_mode: Optional[LineageMode] = Field( default=LineageMode.STL_SCAN_BASED, - description="Which table lineage collector mode to use", + description="Which table lineage collector mode to use. 
Available modes are: [stl_scan_based, sql_based, mixed]", ) @pydantic.validator("platform") diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source/sql/snowflake.py index 1ee654b0bc99d..cd6e5d8235c2d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/snowflake.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/snowflake.py @@ -17,7 +17,9 @@ from datahub.emitter.mcp import MetadataChangeProposalWrapper from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.api.decorators import ( + SourceCapability, SupportStatus, + capability, config_class, platform_name, support_status, @@ -55,6 +57,12 @@ @platform_name("Snowflake") @config_class(SnowflakeConfig) @support_status(SupportStatus.CERTIFIED) +@capability(SourceCapability.PLATFORM_INSTANCE, "Enabled by default") +@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field") +@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") +@capability(SourceCapability.DESCRIPTIONS, "Enabled by default") +@capability(SourceCapability.LINEAGE_COARSE, "Optionally enabled via configuration") +@capability(SourceCapability.DELETION_DETECTION, "Enabled via stateful ingestion") class SnowflakeSource(SQLAlchemySource): def __init__(self, config: SnowflakeConfig, ctx: PipelineContext): super().__init__(config, ctx, "snowflake") diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py b/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py index a2d7bcc320b67..f4618240e9ce0 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py @@ -15,9 +15,7 @@ import datahub.emitter.mce_builder as builder from datahub.configuration.time_window_config import get_time_bucket from datahub.ingestion.api.decorators import ( - SourceCapability, SupportStatus, - capability, config_class, platform_name, support_status, @@ -107,8 +105,6 @@ def get_sql_alchemy_url(self): @platform_name("Trino") @config_class(TrinoUsageConfig) @support_status(SupportStatus.CERTIFIED) -@capability(SourceCapability.DOMAINS, "Supported via the `domain` config field") -@capability(SourceCapability.DATA_PROFILING, "Optionally enabled via configuration") @dataclasses.dataclass class TrinoUsageSource(Source): """ diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/pulsar.py b/metadata-ingestion/src/datahub/ingestion/source_config/pulsar.py index dc3276440af22..e21c6fc3ea42b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/pulsar.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/pulsar.py @@ -39,37 +39,68 @@ def _is_valid_hostname(hostname: str) -> bool: class PulsarSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigBase): env: str = DEFAULT_ENV - # The web URL for the cluster. - web_service_url: str = "http://localhost:8080" - # Timout setting, how long to wait for the Pulsar rest api to send data before giving up - timeout: int = 5 + + web_service_url: str = Field( + default="http://localhost:8080", description="The web URL for the cluster." 
+ ) + timeout: int = Field( + default=5, + description="Timout setting, how long to wait for the Pulsar rest api to send data before giving up", + ) # Mandatory for oauth authentication - issuer_url: Optional[str] = None - client_id: Optional[str] = None - client_secret: Optional[str] = None + issuer_url: Optional[str] = Field( + default=None, + description="The complete URL for a Custom Authorization Server. Mandatory for OAuth based authentication.", + ) + client_id: Optional[str] = Field( + default=None, description="The application's client ID" + ) + client_secret: Optional[str] = Field( + default=None, description="The application's client secret" + ) # Mandatory for token authentication - token: Optional[str] = None - # Either a boolean, in which case it controls whether we verify the server's TLS certificate, or a string, - # in which case it must be a path to a CA bundle to use. - verify_ssl: Union[bool, str] = True + token: Optional[str] = Field( + default=None, + description="The access token for the application. Mandatory for token based authentication.", + ) + verify_ssl: Union[bool, str] = Field( + default=True, + description="Either a boolean, in which case it controls whether we verify the server's TLS certificate, or a string, in which case it must be a path to a CA bundle to use.", + ) # By default, allow all topics and deny the pulsar system topics - tenant_patterns: AllowDenyPattern = AllowDenyPattern(allow=[".*"], deny=["pulsar"]) - namespace_patterns: AllowDenyPattern = AllowDenyPattern( - allow=[".*"], deny=["public/functions"] + tenant_patterns: AllowDenyPattern = Field( + default=AllowDenyPattern(allow=[".*"], deny=["pulsar"]), + description="List of regex patterns for tenants to include/exclude from ingestion. By default all tenants are allowed.", + ) + namespace_patterns: AllowDenyPattern = Field( + default=AllowDenyPattern(allow=[".*"], deny=["public/functions"]), + description="List of regex patterns for namespaces to include/exclude from ingestion. By default the functions namespace is denied.", + ) + topic_patterns: AllowDenyPattern = Field( + default=AllowDenyPattern(allow=[".*"], deny=["/__.*$"]), + description="List of regex patterns for topics to include/exclude from ingestion. By default the Pulsar system topics are denied.", + ) + exclude_individual_partitions: bool = Field( + default=True, + description="Extract each individual partitioned topic. e.g. when turned off a topic with 100 partitions will result in 100 Datesets.", + ) + + tenants: List[str] = Field( + default=[], + description="Listing all tenants requires superUser role, alternative you can set a list of tenants you want to scrape using the tenant admin role", + ) + + domain: Dict[str, AllowDenyPattern] = Field( + default_factory=dict, description="Domain patterns" + ) + + stateful_ingestion: Optional[PulsarSourceStatefulIngestionConfig] = Field( + default=None, description="see Stateful Ingestion" + ) + + oid_config: dict = Field( + default_factory=dict, description="Placeholder for OpenId discovery document" ) - topic_patterns: AllowDenyPattern = AllowDenyPattern(allow=[".*"], deny=["/__.*$"]) - # Exclude partition topics. e.g. 
topics ending on _partition_N where N is a number - exclude_individual_partitions: bool = True - # Listing all tenants requires superUser role, alternative you can set tenants you want to scrape - # using the tenant admin role - tenants: List[str] = [] - - domain: Dict[str, AllowDenyPattern] = dict() - # Custom Stateful Ingestion settings - stateful_ingestion: Optional[PulsarSourceStatefulIngestionConfig] = None - - # Placeholder for OpenId discovery document - oid_config: dict = Field(default_factory=dict) @validator("token") def ensure_only_issuer_or_token( diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/bigquery.py index 143380ac19938..1929f99021985 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/sql/bigquery.py @@ -15,21 +15,54 @@ class BigQueryConfig(BaseTimeWindowConfig, SQLAlchemyConfig): scheme: str = "bigquery" - project_id: Optional[str] = None - lineage_client_project_id: Optional[str] = None - log_page_size: pydantic.PositiveInt = 1000 - credential: Optional[BigQueryCredential] + project_id: Optional[str] = pydantic.Field( + default=None, + description="Project ID to ingest from. If not specified, will infer from environment.", + ) + lineage_client_project_id: Optional[str] = pydantic.Field( + default=None, + description="If you want to use a different ProjectId for the lineage collection you can set it here.", + ) + log_page_size: pydantic.PositiveInt = pydantic.Field( + default=1000, + description="The number of log item will be queried per page for lineage collection", + ) + credential: Optional[BigQueryCredential] = pydantic.Field( + description="BigQuery credential informations" + ) # extra_client_options, include_table_lineage and max_query_duration are relevant only when computing the lineage. - extra_client_options: Dict[str, Any] = {} - include_table_lineage: Optional[bool] = True - max_query_duration: timedelta = timedelta(minutes=15) - credentials_path: Optional[str] = None - bigquery_audit_metadata_datasets: Optional[List[str]] = None - use_exported_bigquery_audit_metadata: bool = False - use_date_sharded_audit_log_tables: bool = False + extra_client_options: Dict[str, Any] = pydantic.Field( + default={}, + description="Additional options to pass to google.cloud.logging_v2.client.Client.", + ) + include_table_lineage: Optional[bool] = pydantic.Field( + default=True, + description="Option to enable/disable lineage generation. Is enabled by default.", + ) + max_query_duration: timedelta = pydantic.Field( + default=timedelta(minutes=15), + description="Correction to pad start_time and end_time with. For handling the case where the read happens within our time range but the query completion event is delayed and happens after the configured end time.", + ) + bigquery_audit_metadata_datasets: Optional[List[str]] = pydantic.Field( + default=None, + description="A list of datasets that contain a table named cloudaudit_googleapis_com_data_access which contain BigQuery audit logs, specifically, those containing BigQueryAuditMetadata. 
It is recommended that the project of the dataset is also specified, for example, projectA.datasetB.", + ) + use_exported_bigquery_audit_metadata: bool = pydantic.Field( + default=False, + description="When configured, use BigQueryAuditMetadata in bigquery_audit_metadata_datasets to compute lineage information.", + ) + use_date_sharded_audit_log_tables: bool = pydantic.Field( + default=False, + description="Whether to read date sharded tables or time partitioned tables when extracting usage from exported audit logs.", + ) _credentials_path: Optional[str] = pydantic.PrivateAttr(None) - use_v2_audit_metadata: Optional[bool] = False - upstream_lineage_in_report: bool = False + use_v2_audit_metadata: Optional[bool] = pydantic.Field( + default=False, description="Whether to ingest logs using the v2 format." + ) + upstream_lineage_in_report: bool = pydantic.Field( + default=False, + description="Useful for debugging lineage information. Set to True to see the raw lineage created internally.", + ) def __init__(self, **data: Any): super().__init__(**data) diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py index 4bae726ce6a22..f9b48b5467dec 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/sql/snowflake.py @@ -38,27 +38,49 @@ class SnowflakeProvisionRoleConfig(ConfigModel): - enabled: bool = False + enabled: bool = pydantic.Field( + default=False, + description="Whether provisioning of Snowflake role (used for ingestion) is enabled or not.", + ) # Can be used by account admin to test what sql statements will be run - dry_run: bool = False + dry_run: bool = pydantic.Field( + default=False, + description="If provision_role is enabled, whether to dry run the sql commands for system admins to see what sql grant commands would be run without actually running the grant commands.", + ) # Setting this to True is helpful in case you want a clean role without any extra privileges # Not set to True by default because multiple parallel # snowflake ingestions can be dependent on single role - drop_role_if_exists: bool = False + drop_role_if_exists: bool = pydantic.Field( + default=False, + description="Useful during testing to ensure you have a clean slate role. Not recommended for production use cases.", + ) # When Account admin is testing they might not want to actually do the ingestion # Set this to False in case the account admin would want to # create role # grant role to user in main config # run ingestion as the user in main config - run_ingestion: bool = False + run_ingestion: bool = pydantic.Field( + default=False, + description="If system admins wish to skip actual ingestion of metadata during testing of the provisioning of role.", + ) - admin_role: Optional[str] = "accountadmin" + admin_role: Optional[str] = pydantic.Field( + default="accountadmin", + description="The Snowflake role of admin user used for provisioning of the role specified by role config. System admins can audit the open source code and decide to use a different role.", + ) - admin_username: str - admin_password: pydantic.SecretStr = pydantic.Field(default=None, exclude=True) + admin_username: str = pydantic.Field( + description="The username to be used for provisioning of role." 
+ ) + + admin_password: pydantic.SecretStr = pydantic.Field( + default=None, + exclude=True, + description="The password to be used for provisioning of role.", + ) @pydantic.validator("admin_username", always=True) def username_not_empty(cls, v, values, **kwargs): @@ -72,21 +94,50 @@ class BaseSnowflakeConfig(BaseTimeWindowConfig): # Note: this config model is also used by the snowflake-usage source. scheme: str = "snowflake" - username: Optional[str] = None - password: Optional[pydantic.SecretStr] = pydantic.Field(default=None, exclude=True) - private_key_path: Optional[str] + username: Optional[str] = pydantic.Field( + default=None, description="Snowflake username." + ) + password: Optional[pydantic.SecretStr] = pydantic.Field( + default=None, exclude=True, description="Snowflake password." + ) + private_key_path: Optional[str] = pydantic.Field( + default=None, + description="The path to the private key if using key pair authentication. See: https://docs.snowflake.com/en/user-guide/key-pair-auth.html", + ) private_key_password: Optional[pydantic.SecretStr] = pydantic.Field( - default=None, exclude=True - ) - authentication_type: str = "DEFAULT_AUTHENTICATOR" - host_port: Optional[str] # Deprecated - account_id: Optional[str] # Once host_port is removed this will be made mandatory - warehouse: Optional[str] - role: Optional[str] - include_table_lineage: bool = True - include_view_lineage: bool = True - connect_args: Optional[Dict] = pydantic.Field(default=None, exclude=True) - check_role_grants: bool = False + default=None, + exclude=True, + description="Password for your private key if using key pair authentication.", + ) + authentication_type: str = pydantic.Field( + default="DEFAULT_AUTHENTICATOR", + description='The type of authenticator to use when connecting to Snowflake. Supports "DEFAULT_AUTHENTICATOR", "EXTERNAL_BROWSER_AUTHENTICATOR" and "KEY_PAIR_AUTHENTICATOR".', + ) + host_port: Optional[str] = pydantic.Field( + description="DEPRECATED: Snowflake account. e.g. abc48144" + ) # Deprecated + account_id: Optional[str] = pydantic.Field( + description="Snowflake account. e.g. abc48144" + ) # Once host_port is removed this will be made mandatory + warehouse: Optional[str] = pydantic.Field(description="Snowflake warehouse.") + role: Optional[str] = pydantic.Field(description="Snowflake role.") + include_table_lineage: bool = pydantic.Field( + default=True, + description="If enabled, populates the snowflake table-to-table and s3-to-snowflake table lineage. Requires appropriate grants given to the role.", + ) + include_view_lineage: bool = pydantic.Field( + default=True, + description="If enabled, populates the snowflake view->table and table->view lineages (no view->view lineage yet). Requires appropriate grants given to the role, and include_table_lineage to be True.", + ) + connect_args: Optional[Dict] = pydantic.Field( + default=None, + description="Connect args to pass to Snowflake SqlAlchemy driver", + exclude=True, + ) + check_role_grants: bool = pydantic.Field( + default=False, + description="If set to True then checks role grants at the beginning of the ingestion run. To be used for debugging purposes. If you think everything is working fine then set it to False. 
In some cases this can take long depending on how many roles you might have.", + ) def get_account(self) -> str: assert self.account_id diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/usage/bigquery_usage.py b/metadata-ingestion/src/datahub/ingestion/source_config/usage/bigquery_usage.py index f4346803d15b3..8ff61674e266d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/usage/bigquery_usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/usage/bigquery_usage.py @@ -16,16 +16,31 @@ class BigQueryCredential(ConfigModel): - project_id: str - private_key_id: str - private_key: str - client_email: str - client_id: str - auth_uri: str = "https://accounts.google.com/o/oauth2/auth" - token_uri: str = "https://oauth2.googleapis.com/token" - auth_provider_x509_cert_url: str = "https://www.googleapis.com/oauth2/v1/certs" - type: str = "service_account" - client_x509_cert_url: Optional[str] + project_id: str = pydantic.Field(description="Project id to set the credentials") + private_key_id: str = pydantic.Field(description="Private key id") + private_key: str = pydantic.Field( + description="Private key in a form of '-----BEGIN PRIVATE KEY-----\nprivate-key\n-----END PRIVATE KEY-----\n'" + ) + client_email: str = pydantic.Field(description="Client email") + client_id: str = pydantic.Field(description="Client Id") + auth_uri: str = pydantic.Field( + default="https://accounts.google.com/o/oauth2/auth", + description="Authentication uri", + ) + token_uri: str = pydantic.Field( + default="https://oauth2.googleapis.com/token", description="Token uri" + ) + auth_provider_x509_cert_url: str = pydantic.Field( + default="https://www.googleapis.com/oauth2/v1/certs", + description="Auth provider x509 certificate url", + ) + type: str = pydantic.Field( + default="service_account", description="Authentication type" + ) + client_x509_cert_url: Optional[str] = pydantic.Field( + default=None, + description="If not set it will be default to https://www.googleapis.com/robot/v1/metadata/x509/client_email", + ) @pydantic.root_validator() def validate_config(cls, values: Dict[str, Any]) -> Dict[str, Any]: @@ -43,22 +58,63 @@ def create_credential_temp_file(self) -> str: class BigQueryUsageConfig(DatasetSourceConfigBase, BaseUsageConfig): - projects: Optional[List[str]] = None - project_id: Optional[str] = None # deprecated in favor of `projects` - extra_client_options: dict = {} - use_v2_audit_metadata: Optional[bool] = False - - bigquery_audit_metadata_datasets: Optional[List[str]] = None - use_exported_bigquery_audit_metadata: bool = False - use_date_sharded_audit_log_tables: bool = False - - table_pattern: AllowDenyPattern = AllowDenyPattern.allow_all() - dataset_pattern: AllowDenyPattern = AllowDenyPattern.allow_all() - log_page_size: pydantic.PositiveInt = 1000 - - query_log_delay: Optional[pydantic.PositiveInt] = None - max_query_duration: timedelta = timedelta(minutes=15) - credential: Optional[BigQueryCredential] + projects: Optional[List[str]] = pydantic.Field( + default=None, + description="List of project ids to ingest usage from. If not specified, will infer from environment.", + ) + project_id: Optional[str] = pydantic.Field( + default=None, + description="Project ID to ingest usage from. If not specified, will infer from environment. 
Deprecated in favour of projects ", + ) + extra_client_options: dict = pydantic.Field( + default_factory=dict, + description="Additional options to pass to google.cloud.logging_v2.client.Client.", + ) + use_v2_audit_metadata: Optional[bool] = pydantic.Field( + default=False, + description="Whether to ingest logs using the v2 format. Required if use_exported_bigquery_audit_metadata is set to True.", + ) + + bigquery_audit_metadata_datasets: Optional[List[str]] = pydantic.Field( + description="A list of datasets that contain a table named cloudaudit_googleapis_com_data_access which contain BigQuery audit logs, specifically, those containing BigQueryAuditMetadata. It is recommended that the project of the dataset is also specified, for example, projectA.datasetB.", + ) + use_exported_bigquery_audit_metadata: bool = pydantic.Field( + default=False, + description="When configured, use BigQueryAuditMetadata in bigquery_audit_metadata_datasets to compute usage information.", + ) + + use_date_sharded_audit_log_tables: bool = pydantic.Field( + default=False, + description="Whether to read date sharded tables or time partitioned tables when extracting usage from exported audit logs.", + ) + + table_pattern: AllowDenyPattern = pydantic.Field( + default=AllowDenyPattern.allow_all(), + description="List of regex patterns for tables to include/exclude from ingestion.", + ) + dataset_pattern: AllowDenyPattern = pydantic.Field( + default=AllowDenyPattern.allow_all(), + description="List of regex patterns for datasets to include/exclude from ingestion.", + ) + log_page_size: pydantic.PositiveInt = pydantic.Field( + default=1000, + description="", + ) + + query_log_delay: Optional[pydantic.PositiveInt] = pydantic.Field( + default=None, + description="To account for the possibility that the query event arrives after the read event in the audit logs, we wait for at least query_log_delay additional events to be processed before attempting to resolve BigQuery job information from the logs. If query_log_delay is None, it gets treated as an unlimited delay, which prioritizes correctness at the expense of memory usage.", + ) + + max_query_duration: timedelta = pydantic.Field( + default=timedelta(minutes=15), + description="Correction to pad start_time and end_time with. For handling the case where the read happens within our time range but the query completion event is delayed and happens after the configured end time.", + ) + + credential: Optional[BigQueryCredential] = pydantic.Field( + default=None, + description="Bigquery credential. Required if GOOGLE_APPLICATION_CREDENTIALS enviroment variable is not set. 
See this example recipe for details", + ) _credentials_path: Optional[str] = pydantic.PrivateAttr(None) def __init__(self, **data: Any): diff --git a/metadata-ingestion/src/datahub/ingestion/source_config/usage/snowflake_usage.py b/metadata-ingestion/src/datahub/ingestion/source_config/usage/snowflake_usage.py index a4b09d624b69e..8f74e425da862 100644 --- a/metadata-ingestion/src/datahub/ingestion/source_config/usage/snowflake_usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source_config/usage/snowflake_usage.py @@ -27,16 +27,39 @@ class SnowflakeStatefulIngestionConfig(StatefulIngestionConfig): class SnowflakeUsageConfig( BaseSnowflakeConfig, BaseUsageConfig, StatefulIngestionConfigBase ): - options: dict = {} - database_pattern: AllowDenyPattern = AllowDenyPattern( - deny=[r"^UTIL_DB$", r"^SNOWFLAKE$", r"^SNOWFLAKE_SAMPLE_DATA$"] - ) - email_domain: Optional[str] - schema_pattern: AllowDenyPattern = AllowDenyPattern.allow_all() - table_pattern: AllowDenyPattern = AllowDenyPattern.allow_all() - view_pattern: AllowDenyPattern = AllowDenyPattern.allow_all() - apply_view_usage_to_tables: bool = False - stateful_ingestion: Optional[SnowflakeStatefulIngestionConfig] = None + options: dict = pydantic.Field( + default_factory=dict, + description="Any options specified here will be passed to SQLAlchemy's create_engine as kwargs. See https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine for details.", + ) + + database_pattern: AllowDenyPattern = pydantic.Field( + default=AllowDenyPattern( + deny=[r"^UTIL_DB$", r"^SNOWFLAKE$", r"^SNOWFLAKE_SAMPLE_DATA$"] + ), + description="List of regex patterns for databases to include/exclude in usage ingestion.", + ) + email_domain: Optional[str] = pydantic.Field( + description="Email domain of your organisation so users can be displayed on UI appropriately." + ) + schema_pattern: AllowDenyPattern = pydantic.Field( + default=AllowDenyPattern.allow_all(), + description="List of regex patterns for schemas to include/exclude in usage ingestion.", + ) + table_pattern: AllowDenyPattern = pydantic.Field( + default=AllowDenyPattern.allow_all(), + description="List of regex patterns for tables to include in ingestion.", + ) + view_pattern: AllowDenyPattern = pydantic.Field( + default=AllowDenyPattern.allow_all(), + description="List of regex patterns for views to include in ingestion.", + ) + apply_view_usage_to_tables: bool = pydantic.Field( + default=False, + description="Allow/deny patterns for views in snowflake dataset names.", + ) + stateful_ingestion: Optional[SnowflakeStatefulIngestionConfig] = pydantic.Field( + default=None, description="Stateful ingestion related configs" + ) def get_options(self) -> dict: options_connect_args: Dict = super().get_sql_alchemy_connect_args()
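
The recurring change across the Python files in this patch is a single documentation pattern: plain attribute defaults (e.g. `timeout: int = 5`) are replaced with `pydantic.Field(default=..., description=...)` so that every config option carries a description alongside its default. The `@platform_name`/`@support_status`/`@config_class`/`@capability` decorators added to the source classes serve the same purpose at the plugin level. Below is a minimal sketch of the field pattern only — the class and field names are hypothetical and it assumes pydantic v1, which these configs target (`root_validator`, `PrivateAttr`, `SecretStr` usage); it is an illustration, not any actual DataHub source config.

```python
from typing import Optional

import pydantic


class ExampleSourceConfig(pydantic.BaseModel):
    # Hypothetical fields, for illustration only -- not a real source config.
    # Before: `timeout: int = 5` carried no description at all.
    timeout: int = pydantic.Field(
        default=5,
        description="How long to wait for the REST API to respond before giving up.",
    )
    token: Optional[str] = pydantic.Field(
        default=None,
        description="Access token. Mandatory for token-based authentication.",
    )


# Under pydantic v1, the descriptions can be read back off the model,
# which is what lets per-field documentation be generated instead of
# being maintained by hand in markdown tables:
for name, field in ExampleSourceConfig.__fields__.items():
    print(f"{name}: {field.field_info.description}")
```

This is also why the hand-written "Config details" table is removed from `pulsar.md` in this patch: once each field declares its own description, the reference table can be generated from the config class rather than duplicated in the docs.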