Skip to content

Commit

Permalink
Merge branch 'master' into cc--display-source-description
Browse files Browse the repository at this point in the history
  • Loading branch information
anshbansal authored Jul 30, 2023
2 parents 1d00312 + 753b0f1 commit 1b5a6c5
Show file tree
Hide file tree
Showing 37 changed files with 579 additions and 36,464 deletions.
17 changes: 17 additions & 0 deletions .github/workflows/docker-unified.yml
Original file line number Diff line number Diff line change
Expand Up @@ -490,6 +490,9 @@ jobs:
if: ${{ needs.setup.outputs.publish != 'true' }}
with:
image: ${{ env.DATAHUB_UPGRADE_IMAGE }}:${{ needs.setup.outputs.unique_tag }}
- name: Disable datahub-actions
run: |
yq -i 'del(.services.datahub-actions)' docker/quickstart/docker-compose-without-neo4j.quickstart.yml
- name: run quickstart
env:
DATAHUB_TELEMETRY_ENABLED: false
Expand All @@ -501,6 +504,20 @@ jobs:
# we are doing this because gms takes time to get ready
# and we don't have a better readiness check when bootstrap is done
sleep 60s
- name: Disable ES Disk Threshold
run: |
curl -XPUT "http://localhost:9200/_cluster/settings" \
-H 'Content-Type: application/json' -d'{
"persistent": {
"cluster": {
"routing": {
"allocation.disk.threshold_enabled": false
}
}
}
}'
- name: Remove Source Code
run: find ./*/* ! -path "./metadata-ingestion*" ! -path "./smoke-test*" ! -path "./gradle*" -delete
- name: Smoke test
env:
RUN_QUICKSTART: false
Expand Down
2 changes: 2 additions & 0 deletions docker/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ task quickstart(type: Exec, dependsOn: ':metadata-ingestion:install') {

environment "DATAHUB_TELEMETRY_ENABLED", "false"
environment "DOCKER_COMPOSE_BASE", "file://${rootProject.projectDir}"
environment "ACTIONS_VERSION", 'alpine3.17-slim'
environment "DATAHUB_ACTIONS_IMAGE", 'nginx'

def cmd = [
'source ../metadata-ingestion/venv/bin/activate && ',
Expand Down
2 changes: 1 addition & 1 deletion docker/docker-compose-with-cassandra.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ services:
datahub-actions:
container_name: datahub-actions
hostname: actions
image: acryldata/datahub-actions:${ACTIONS_VERSION:-head}
image: ${DATAHUB_ACTIONS_IMAGE:-acryldata/datahub-actions}:${ACTIONS_VERSION:-head}
env_file: datahub-actions/env/docker.env
depends_on:
datahub-gms:
Expand Down
2 changes: 1 addition & 1 deletion docker/docker-compose-without-neo4j.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ services:
datahub-actions:
container_name: datahub-actions
hostname: actions
image: acryldata/datahub-actions:${ACTIONS_VERSION:-head}
image: ${DATAHUB_ACTIONS_IMAGE:-acryldata/datahub-actions}:${ACTIONS_VERSION:-head}
env_file: datahub-actions/env/docker.env
depends_on:
datahub-gms:
Expand Down
2 changes: 1 addition & 1 deletion docker/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ services:
datahub-actions:
container_name: datahub-actions
hostname: actions
image: acryldata/datahub-actions:${ACTIONS_VERSION:-head}
image: ${DATAHUB_ACTIONS_IMAGE:-acryldata/datahub-actions}:${ACTIONS_VERSION:-head}
env_file: datahub-actions/env/docker.env
depends_on:
datahub-gms:
Expand Down
2 changes: 1 addition & 1 deletion docker/quickstart/docker-compose-m1.quickstart.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ services:
- METADATA_CHANGE_LOG_VERSIONED_TOPIC_NAME=MetadataChangeLog_Versioned_v1
- SCHEMA_REGISTRY_URL=http://schema-registry:8081
hostname: actions
image: acryldata/datahub-actions:${ACTIONS_VERSION:-head}
image: ${DATAHUB_ACTIONS_IMAGE:-acryldata/datahub-actions}:${ACTIONS_VERSION:-head}
datahub-frontend-react:
container_name: datahub-frontend-react
depends_on:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ services:
- METADATA_CHANGE_LOG_VERSIONED_TOPIC_NAME=MetadataChangeLog_Versioned_v1
- SCHEMA_REGISTRY_URL=http://schema-registry:8081
hostname: actions
image: acryldata/datahub-actions:${ACTIONS_VERSION:-head}
image: ${DATAHUB_ACTIONS_IMAGE:-acryldata/datahub-actions}:${ACTIONS_VERSION:-head}
datahub-frontend-react:
container_name: datahub-frontend-react
depends_on:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ services:
- METADATA_CHANGE_LOG_VERSIONED_TOPIC_NAME=MetadataChangeLog_Versioned_v1
- SCHEMA_REGISTRY_URL=http://schema-registry:8081
hostname: actions
image: acryldata/datahub-actions:${ACTIONS_VERSION:-head}
image: ${DATAHUB_ACTIONS_IMAGE:-acryldata/datahub-actions}:${ACTIONS_VERSION:-head}
datahub-frontend-react:
container_name: datahub-frontend-react
depends_on:
Expand Down
2 changes: 1 addition & 1 deletion docker/quickstart/docker-compose.quickstart.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ services:
- METADATA_CHANGE_LOG_VERSIONED_TOPIC_NAME=MetadataChangeLog_Versioned_v1
- SCHEMA_REGISTRY_URL=http://schema-registry:8081
hostname: actions
image: acryldata/datahub-actions:${ACTIONS_VERSION:-head}
image: ${DATAHUB_ACTIONS_IMAGE:-acryldata/datahub-actions}:${ACTIONS_VERSION:-head}
datahub-frontend-react:
container_name: datahub-frontend-react
depends_on:
Expand Down
1 change: 1 addition & 0 deletions docs-website/sidebars.js
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,7 @@ module.exports = {
"docs/components",
"docs/architecture/metadata-ingestion",
"docs/architecture/metadata-serving",
"docs/architecture/docker-containers",
],
},
{
Expand Down
27 changes: 27 additions & 0 deletions docs/architecture/docker-containers.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
---
title: "Docker Container Architecture"
---

# Docker Container Architecture

When running DataHub via docker-compose or helm, the following is a diagram of the containers involved
with running DataHub and their relationships with each other. The helm chart uses helm hooks to determine
the proper ordering of the components whereas docker-compose relies on a series of health checks.

```text
datahub-frontend-react datahub-actions
\ /
| datahub-upgrade (NoCodeDataMigration, helm only)
| /
datahub-gms (healthy)
|
datahub-upgrade (SystemUpdate completed)
/--------------------/ | \ \------------------------------------------------\
/ | \-------------------\ \
mysql-setup (completed) elasticsearch-setup (completed) kafka-setup (completed) (if apply) neo4j (healthy)
| | / \
| | / \
mysql (healthy) elasticsearch (healthy) broker (healthy) (if not internal) schema-registry (healthy)
|
zookeeper (healthy)
```
1 change: 1 addition & 0 deletions docs/cli.md
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ The environment variables listed below take precedence over the DataHub CLI conf
- `DATAHUB_DEBUG` (default `false`) - Set to `true` to enable debug logging for CLI. Can also be achieved through `--debug` option of the CLI.
- `DATAHUB_VERSION` (default `head`) - Set to a specific version to run quickstart with the particular version of docker images.
- `ACTIONS_VERSION` (default `head`) - Set to a specific version to run quickstart with that image tag of `datahub-actions` container.
- `DATAHUB_ACTIONS_IMAGE` (default `acryldata/datahub-actions`) - Set to a custom image repository (e.g. the `-slim` variant of the actions image) to run a slimmer actions container without pyspark/deequ features.

```shell
DATAHUB_SKIP_CONFIG=false
Expand Down
4 changes: 4 additions & 0 deletions metadata-ingestion/docs/sources/bigquery/bigquery_pre.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,10 @@ If you have multiple projects in your BigQuery setup, the role should be granted
client_id: "123456678890"
```

##### Profiling Requirements

To profile BigQuery external tables backed by a Google Drive document, you need to grant the service account's email address (`client_email` in the credentials JSON file) "Viewer" access to that document. To find the Google Drive document linked to a BigQuery table, open the BigQuery console, locate the table, select "Details" from the drop-down menu in the top-right corner, and refer to the "Source" field. To share access to the Google Drive document, open the document, click "Share" in the top-right corner, and add the service account's email address with "Viewer" access. ![Google Drive Sharing Dialog](https://github.com/datahub-project/static-assets/raw/main/imgs/integrations/bigquery/google_drive_share.png)
### Lineage Computation Details
When `use_exported_bigquery_audit_metadata` is set to `true`, lineage information will be computed using exported bigquery logs. On how to setup exported bigquery audit logs, refer to the following [docs](https://cloud.google.com/bigquery/docs/reference/auditlogs#defining_a_bigquery_log_sink_using_gcloud) on BigQuery audit logs. Note that only protoPayloads with "type.googleapis.com/google.cloud.audit.BigQueryAuditMetadata" are supported by the current ingestion version. The `bigquery_audit_metadata_datasets` parameter will be used only if `use_exported_bigquery_audit_metadata` is set to `true`.
Expand Down
34 changes: 22 additions & 12 deletions metadata-ingestion/docs/sources/looker/lookml_pre.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,29 @@ To use LookML ingestion through the UI, or automate github checkout through the
In a nutshell, there are three steps:

1. Generate a private-public ssh key pair. This will typically generate two files, e.g. looker_datahub_deploy_key (this is the private key) and looker_datahub_deploy_key.pub (this is the public key)
![Image](https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/gitssh/ssh-key-generation.png)
![Image](https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/gitssh/ssh-key-generation.png)

2. Add the public key to your Looker git repo as a deploy key with read access (no need to provision write access). Follow the guide [here](https://docs.github.com/en/developers/overview/managing-deploy-keys#deploy-keys) for that.
![Image](https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/gitssh/git-deploy-key.png)
![Image](https://raw.githubusercontent.com/datahub-project/static-assets/main/imgs/gitssh/git-deploy-key.png)

3. Make note of the private key file, you will need to paste the contents of the file into the **GitHub Deploy Key** field later while setting up [ingestion using the UI](#ui-based-ingestion-recommended-for-ease-of-use).

### Setup your connection mapping

The connection mapping enables DataHub to accurately generate lineage to your upstream warehouse.
It maps Looker connection names to the platform and database that they're pointing to.

There's two ways to configure this:

1. Provide Looker **admin** API credentials, and we'll automatically map lineage correctly. Details on how to do this are below.
2. Manually populate the `connection_to_platform_map` and `project_name` configuration fields. See the starter recipe for an example of what this should look like.

#### [Optional] Create an API key with admin privileges

See the [Looker authentication docs](https://docs.looker.com/reference/api-and-integration/api-auth#authentication_with_an_sdk) for the steps to create a client ID and secret.
You need to ensure that the API key is attached to a user that has Admin privileges.

If that is not possible, read the configuration section and provide an offline specification of the `connection_to_platform_map` and the `project_name`.
If you don't want to provide admin API credentials, you can manually populate the `connection_to_platform_map` and `project_name` in the ingestion configuration.

### Ingestion Options

Expand Down Expand Up @@ -80,7 +90,6 @@ on:
release:
types: [published, edited]
workflow_dispatch:


jobs:
lookml-metadata-upload:
Expand All @@ -89,12 +98,13 @@ jobs:
- uses: actions/checkout@v3
- uses: actions/setup-python@v4
with:
python-version: '3.10'
python-version: "3.10"
- name: Run LookML ingestion
run: |
pip install 'acryl-datahub[lookml,datahub-rest]'
cat << EOF > lookml_ingestion.yml
# LookML ingestion configuration
# LookML ingestion configuration.
# This is a full ingestion recipe, and supports all config options that the LookML source supports.
source:
type: "lookml"
config:
Expand All @@ -106,22 +116,22 @@ jobs:
# Options
#connection_to_platform_map:
# connection-name:
#platform: platform-name (e.g. snowflake)
#default_db: default-db-name (e.g. DEMO_PIPELINE)
# platform: platform-name (e.g. snowflake)
# default_db: default-db-name (e.g. DEMO_PIPELINE)
api:
client_id: ${LOOKER_CLIENT_ID}
client_secret: ${LOOKER_CLIENT_SECRET}
base_url: ${LOOKER_BASE_URL}
sink:
type: datahub-rest
config:
server: ${DATAHUB_GMS_HOST}
token: ${DATAHUB_TOKEN}
server: ${DATAHUB_GMS_URL}
token: ${DATAHUB_GMS_TOKEN}
EOF
datahub ingest -c lookml_ingestion.yml
env:
DATAHUB_GMS_HOST: ${{ secrets.DATAHUB_GMS_HOST }}
DATAHUB_TOKEN: ${{ secrets.DATAHUB_TOKEN }}
DATAHUB_GMS_URL: ${{ secrets.DATAHUB_GMS_URL }}
DATAHUB_GMS_TOKEN: ${{ secrets.DATAHUB_GMS_TOKEN }}
LOOKER_BASE_URL: ${{ secrets.LOOKER_BASE_URL }}
LOOKER_CLIENT_ID: ${{ secrets.LOOKER_CLIENT_ID }}
LOOKER_CLIENT_SECRET: ${{ secrets.LOOKER_CLIENT_SECRET }}
Expand Down
9 changes: 4 additions & 5 deletions metadata-ingestion/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,12 +126,11 @@ def get_long_description():
}

sqllineage_lib = {
"sqllineage==1.3.6",
"sqllineage==1.3.8",
# We don't have a direct dependency on sqlparse but it is a dependency of sqllineage.
# As per https://github.com/reata/sqllineage/issues/361
# and https://github.com/reata/sqllineage/pull/360
# sqllineage has compat issues with sqlparse 0.4.4.
"sqlparse==0.4.3",
# There have previously been issues from not pinning sqlparse, so it's best to pin it.
# Related: https://github.com/reata/sqllineage/issues/361 and https://github.com/reata/sqllineage/pull/360
"sqlparse==0.4.4",
}

sqlglot_lib = {
Expand Down
5 changes: 0 additions & 5 deletions metadata-ingestion/src/datahub/ingestion/graph/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@

from datahub.cli.cli_utils import get_url_and_token
from datahub.configuration.common import ConfigModel, GraphError, OperationalError
from datahub.configuration.validate_field_removal import pydantic_removed_field
from datahub.emitter.aspect import TIMESERIES_ASPECT_MAP
from datahub.emitter.mce_builder import DEFAULT_ENV, Aspect, make_data_platform_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
Expand Down Expand Up @@ -65,10 +64,6 @@ class DatahubClientConfig(ConfigModel):
ca_certificate_path: Optional[str] = None
disable_ssl_verification: bool = False

_max_threads_moved_to_sink = pydantic_removed_field(
"max_threads", print_warning=False
)


# Alias for backwards compatibility.
# DEPRECATION: Remove in v0.10.2.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,9 @@ def create(
elif config_dict is None:
raise ConfigurationError("Missing provider configuration.")
else:
provider_config = DatahubIngestionStateProviderConfig.parse_obj(config_dict)
provider_config = (
DatahubIngestionStateProviderConfig.parse_obj_allow_extras(config_dict)
)
if provider_config.datahub_api:
graph = DataHubGraph(provider_config.datahub_api)
return cls(graph, name)
Expand Down
Loading

0 comments on commit 1b5a6c5

Please sign in to comment.