Merge branch 'master' into tableau-site-url
hsheth2 authored Jul 31, 2024
2 parents 9c74076 + 89933fe · commit ddc66f1
Showing 14 changed files with 102 additions and 73 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/airflow-plugin.yml
@@ -74,7 +74,7 @@ jobs:
      - name: pip freeze show list installed
        if: always()
        run: source metadata-ingestion-modules/airflow-plugin/venv/bin/activate && pip freeze
-      - uses: actions/upload-artifact@v4
+      - uses: actions/upload-artifact@v3
        if: ${{ always() && matrix.python-version == '3.10' && matrix.extra_pip_requirements == 'apache-airflow>=2.7.0' }}
        with:
          name: Test Results (Airflow Plugin ${{ matrix.python-version}})
@@ -98,7 +98,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Upload
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v3
        with:
          name: Event File
          path: ${{ github.event_path }}
4 changes: 2 additions & 2 deletions .github/workflows/build-and-test.yml
@@ -99,7 +99,7 @@ jobs:
        if: ${{ matrix.command == 'except_metadata_ingestion' && needs.setup.outputs.backend_change == 'true' }}
        run: |
          ./gradlew -PjavaClassVersionDefault=8 :metadata-integration:java:spark-lineage:compileJava
-      - uses: actions/upload-artifact@v4
+      - uses: actions/upload-artifact@v3
        if: always()
        with:
          name: Test Results (build)
@@ -128,7 +128,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Upload
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v3
        with:
          name: Event File
          path: ${{ github.event_path }}
4 changes: 2 additions & 2 deletions .github/workflows/dagster-plugin.yml
@@ -56,7 +56,7 @@ jobs:
      - name: pip freeze show list installed
        if: always()
        run: source metadata-ingestion-modules/dagster-plugin/venv/bin/activate && pip freeze
-      - uses: actions/upload-artifact@v4
+      - uses: actions/upload-artifact@v3
        if: ${{ always() && matrix.python-version == '3.10' && matrix.extraPythonRequirement == 'dagster>=1.3.3' }}
        with:
          name: Test Results (dagster Plugin ${{ matrix.python-version}})
@@ -79,7 +79,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Upload
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v3
        with:
          name: Event File
          path: ${{ github.event_path }}
6 changes: 3 additions & 3 deletions .github/workflows/docker-unified.yml
@@ -1024,18 +1024,18 @@ jobs:
          docker logs datahub-datahub-frontend-react-1 >& frontend-${{ matrix.test_strategy }}.log || true
          docker logs datahub-upgrade-1 >& upgrade-${{ matrix.test_strategy }}.log || true
      - name: Upload logs
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v3
        if: failure()
        with:
          name: docker logs
          path: "*.log"
      - name: Upload screenshots
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v3
        if: failure()
        with:
          name: cypress-snapshots-${{ matrix.test_strategy }}
          path: smoke-test/tests/cypress/cypress/screenshots/
-      - uses: actions/upload-artifact@v4
+      - uses: actions/upload-artifact@v3
        if: always()
        with:
          name: Test Results (smoke tests) ${{ matrix.test_strategy }}
4 changes: 2 additions & 2 deletions .github/workflows/metadata-ingestion.yml
@@ -83,7 +83,7 @@ jobs:
          df -hl
          docker image ls
          docker system df
-      - uses: actions/upload-artifact@v4
+      - uses: actions/upload-artifact@v3
        with:
          name: Test Results (metadata ingestion ${{ matrix.python-version }})
          path: |
@@ -106,7 +106,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Upload
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v3
        with:
          name: Event File
          path: ${{ github.event_path }}
4 changes: 2 additions & 2 deletions .github/workflows/metadata-io.yml
@@ -62,7 +62,7 @@ jobs:
      - name: Gradle build (and test)
        run: |
          ./gradlew :metadata-io:test
-      - uses: actions/upload-artifact@v4
+      - uses: actions/upload-artifact@v3
        if: always()
        with:
          name: Test Results (metadata-io)
@@ -78,7 +78,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Upload
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v3
        with:
          name: Event File
          path: ${{ github.event_path }}
4 changes: 2 additions & 2 deletions .github/workflows/spark-smoke-test.yml
@@ -69,14 +69,14 @@ jobs:
          docker logs elasticsearch >& elasticsearch-${{ matrix.test_strategy }}.log || true
          docker logs datahub-frontend-react >& frontend-${{ matrix.test_strategy }}.log || true
      - name: Upload logs
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@v3
        if: failure()
        with:
          name: docker logs
          path: |
            "**/build/container-logs/*.log"
            "*.log"
-      - uses: actions/upload-artifact@v4
+      - uses: actions/upload-artifact@v3
        if: always()
        with:
          name: Test Results (smoke tests)
16 changes: 8 additions & 8 deletions docker/datahub-ingestion-base/Dockerfile
@@ -85,18 +85,18 @@ RUN apt-get update && apt-get install -y -qq \
RUN if [ $(arch) = "x86_64" ]; then \
    mkdir /opt/oracle && \
    cd /opt/oracle && \
-    wget --no-verbose -c https://download.oracle.com/otn_software/linux/instantclient/216000/instantclient-basic-linux.x64-21.6.0.0.0dbru.zip && \
-    unzip instantclient-basic-linux.x64-21.6.0.0.0dbru.zip && \
-    rm instantclient-basic-linux.x64-21.6.0.0.0dbru.zip && \
-    sh -c "echo /opt/oracle/instantclient_21_6 > /etc/ld.so.conf.d/oracle-instantclient.conf" && \
+    wget --no-verbose -c https://download.oracle.com/otn_software/linux/instantclient/2115000/instantclient-basic-linux.x64-21.15.0.0.0dbru.zip && \
+    unzip instantclient-basic-linux.x64-21.15.0.0.0dbru.zip && \
+    rm instantclient-basic-linux.x64-21.15.0.0.0dbru.zip && \
+    sh -c "echo /opt/oracle/instantclient_21_15 > /etc/ld.so.conf.d/oracle-instantclient.conf" && \
    ldconfig; \
    else \
    mkdir /opt/oracle && \
    cd /opt/oracle && \
-    wget --no-verbose -c https://download.oracle.com/otn_software/linux/instantclient/191000/instantclient-basic-linux.arm64-19.10.0.0.0dbru.zip && \
-    unzip instantclient-basic-linux.arm64-19.10.0.0.0dbru.zip && \
-    rm instantclient-basic-linux.arm64-19.10.0.0.0dbru.zip && \
-    sh -c "echo /opt/oracle/instantclient_19_10 > /etc/ld.so.conf.d/oracle-instantclient.conf" && \
+    wget --no-verbose -c https://download.oracle.com/otn_software/linux/instantclient/1923000/instantclient-basic-linux.arm64-19.23.0.0.0dbru.zip && \
+    unzip instantclient-basic-linux.arm64-19.23.0.0.0dbru.zip && \
+    rm instantclient-basic-linux.arm64-19.23.0.0.0dbru.zip && \
+    sh -c "echo /opt/oracle/instantclient_19_23 > /etc/ld.so.conf.d/oracle-instantclient.conf" && \
    ldconfig; \
    fi;

2 changes: 1 addition & 1 deletion docker/datahub-ingestion/Dockerfile
@@ -1,7 +1,7 @@
# Defining environment
ARG APP_ENV=full
ARG BASE_IMAGE=acryldata/datahub-ingestion-base
-ARG DOCKER_VERSION=head
+ARG DOCKER_VERSION=head-full
ARG DEBIAN_REPO_URL=https://deb.debian.org/debian
ARG PIP_MIRROR_URL=https://pypi.python.org/simple

24 changes: 24 additions & 0 deletions docker/kafka-setup/env_to_properties.py
@@ -0,0 +1,24 @@
+import os
+import re
+import sys
+
+
+def env_to_properties(env_prefix: str, properties_file: str):
+    pattern = re.compile('(?<=[^_])_(?=[^_])')
+    props = {}
+
+    for (env_name, val) in os.environ.items():
+        if env_name.startswith(env_prefix):
+            raw_name = env_name[len(env_prefix):].lower()
+            prop_dot = '.'.join(pattern.split(raw_name))
+            props[prop_dot] = val
+
+    with open(properties_file, 'a') as f:
+        for k, v in props.items():
+            f.writelines(f'{k}={v}\n')
+
+
+if __name__ == '__main__':
+    env_prefix = sys.argv[1]
+    properties_file = sys.argv[2]
+    env_to_properties(env_prefix, properties_file)
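For context, a minimal usage sketch of this new helper (not part of the commit; the variable values and output path are illustrative). The lookaround regex splits property names on single underscores only, so KAFKA_PROPERTIES_SECURITY_PROTOCOL becomes security.protocol; kafka-setup.sh invokes the script as python env_to_properties.py KAFKA_PROPERTIES_ $CONNECTION_PROPERTIES_PATH, as shown in the next file.

import os

from env_to_properties import env_to_properties  # assumes the script's directory is on PYTHONPATH

# Illustrative values; any KAFKA_PROPERTIES_* variable in the environment is picked up the same way.
os.environ["KAFKA_PROPERTIES_SECURITY_PROTOCOL"] = "SASL_SSL"
os.environ["KAFKA_PROPERTIES_SASL_MECHANISM"] = "PLAIN"

env_to_properties("KAFKA_PROPERTIES_", "/tmp/connection.properties")
# /tmp/connection.properties now ends with (order may vary, alongside any other
# KAFKA_PROPERTIES_* variables already set):
#   security.protocol=SASL_SSL
#   sasl.mechanism=PLAIN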
40 changes: 1 addition & 39 deletions docker/kafka-setup/kafka-setup.sh
@@ -10,46 +10,8 @@ fi
. kafka-config.sh

echo "bootstrap.servers=$KAFKA_BOOTSTRAP_SERVER" > $CONNECTION_PROPERTIES_PATH
-echo "security.protocol=$KAFKA_PROPERTIES_SECURITY_PROTOCOL" >> $CONNECTION_PROPERTIES_PATH

-## Add support for SASL_PLAINTEXT
-if [[ $KAFKA_PROPERTIES_SECURITY_PROTOCOL == "SASL_PLAINTEXT" ]]; then
-    echo "sasl.mechanism=$KAFKA_PROPERTIES_SASL_MECHANISM" >> $CONNECTION_PROPERTIES_PATH
-    echo "sasl.jaas.config=$KAFKA_PROPERTIES_SASL_JAAS_CONFIG" >> $CONNECTION_PROPERTIES_PATH
-    echo "sasl.kerberos.service.name=$KAFKA_PROPERTIES_SASL_KERBEROS_SERVICE_NAME" >> $CONNECTION_PROPERTIES_PATH
-fi

-## Add support for SASL_SSL
-if [[ $KAFKA_PROPERTIES_SECURITY_PROTOCOL == "SASL_SSL" ]]; then
-    echo "sasl.jaas.config=$KAFKA_PROPERTIES_SASL_JAAS_CONFIG" >> $CONNECTION_PROPERTIES_PATH
-    echo "sasl.mechanism=$KAFKA_PROPERTIES_SASL_MECHANISM" >> $CONNECTION_PROPERTIES_PATH
-fi

-if [[ $KAFKA_PROPERTIES_SECURITY_PROTOCOL == "SSL" ]]; then
-    if [[ -n $KAFKA_PROPERTIES_SSL_KEYSTORE_LOCATION ]]; then
-        echo "ssl.keystore.location=$KAFKA_PROPERTIES_SSL_KEYSTORE_LOCATION" >> $CONNECTION_PROPERTIES_PATH
-        echo "ssl.keystore.password=$KAFKA_PROPERTIES_SSL_KEYSTORE_PASSWORD" >> $CONNECTION_PROPERTIES_PATH
-        echo "ssl.key.password=$KAFKA_PROPERTIES_SSL_KEY_PASSWORD" >> $CONNECTION_PROPERTIES_PATH
-        if [[ -n $KAFKA_PROPERTIES_SSL_KEYSTORE_TYPE ]]; then
-            echo "ssl.keystore.type=$KAFKA_PROPERTIES_SSL_KEYSTORE_TYPE" >> $CONNECTION_PROPERTIES_PATH
-        fi
-    fi
-    if [[ -n $KAFKA_PROPERTIES_SSL_TRUSTSTORE_LOCATION ]]; then
-        echo "ssl.truststore.location=$KAFKA_PROPERTIES_SSL_TRUSTSTORE_LOCATION" >> $CONNECTION_PROPERTIES_PATH
-        if [[ $KAFKA_PROPERTIES_SSL_TRUSTSTORE_TYPE != "PEM" ]]; then
-            echo "ssl.truststore.password=$KAFKA_PROPERTIES_SSL_TRUSTSTORE_PASSWORD" >> $CONNECTION_PROPERTIES_PATH
-        fi
-        if [[ -n $KAFKA_PROPERTIES_SSL_TRUSTSTORE_TYPE ]]; then
-            echo "ssl.truststore.type=$KAFKA_PROPERTIES_SSL_TRUSTSTORE_TYPE" >> $CONNECTION_PROPERTIES_PATH
-        fi
-    fi
-    echo "ssl.endpoint.identification.algorithm=$KAFKA_PROPERTIES_SSL_ENDPOINT_IDENTIFICATION_ALGORITHM" >> $CONNECTION_PROPERTIES_PATH
-fi

-# Add support for SASL_CLIENT_CALLBACK_HANDLER_CLASS
-if [[ -n "$KAFKA_PROPERTIES_SASL_CLIENT_CALLBACK_HANDLER_CLASS" ]]; then
-    echo "sasl.client.callback.handler.class=$KAFKA_PROPERTIES_SASL_CLIENT_CALLBACK_HANDLER_CLASS" >> $CONNECTION_PROPERTIES_PATH
-fi
+python env_to_properties.py KAFKA_PROPERTIES_ $CONNECTION_PROPERTIES_PATH

# cub kafka-ready -c $CONNECTION_PROPERTIES_PATH -b $KAFKA_BOOTSTRAP_SERVER 1 180
. kafka-ready.sh
1 change: 1 addition & 0 deletions metadata-ingestion/src/datahub/cli/get_cli.py
@@ -56,6 +56,7 @@ def urn(ctx: Any, urn: Optional[str], aspect: List[str], details: bool) -> None:
            entity_urn=urn,
            aspects=aspect,
            typed=False,
+            details=details,
        ),
        sort_keys=True,
        indent=2,
37 changes: 31 additions & 6 deletions metadata-ingestion/src/datahub/ingestion/source/dbt/dbt_cloud.py
@@ -40,8 +40,7 @@ class DBTCloudConfig(DBTCommonConfig):

    metadata_endpoint: str = Field(
        default="https://metadata.cloud.getdbt.com/graphql",
-        description="The dbt Cloud metadata API endpoint. This is deprecated, and will be removed in a future release. Please use access_url instead.",
-        deprecated=True,
+        description="The dbt Cloud metadata API endpoint. If not provided, we will try to infer it from the access_url.",
    )

    token: str = Field(
@@ -66,13 +65,39 @@ class DBTCloudConfig(DBTCommonConfig):
    @root_validator(pre=True)
    def set_metadata_endpoint(cls, values: dict) -> dict:
        if values.get("access_url") and not values.get("metadata_endpoint"):
-            parsed_uri = urlparse(values["access_url"])
-            values[
-                "metadata_endpoint"
-            ] = f"{parsed_uri.scheme}://metadata.{parsed_uri.netloc}/graphql"
+            metadata_endpoint = infer_metadata_endpoint(values["access_url"])
+            if metadata_endpoint is None:
+                raise ValueError(
+                    "Unable to infer the metadata endpoint from the access URL. Please provide a metadata endpoint."
+                )
+            values["metadata_endpoint"] = metadata_endpoint
        return values


+def infer_metadata_endpoint(access_url: str) -> Optional[str]:
+    # See https://docs.getdbt.com/docs/cloud/about-cloud/access-regions-ip-addresses#api-access-urls
+    # and https://docs.getdbt.com/docs/dbt-cloud-apis/discovery-querying#discovery-api-endpoints
+
+    try:
+        parsed_uri = urlparse(access_url)
+        assert parsed_uri.scheme is not None
+        assert parsed_uri.hostname is not None
+    except Exception as e:
+        logger.debug(f"Unable to parse access URL {access_url}: {e}", exc_info=e)
+        return None
+
+    if parsed_uri.hostname.endswith(".dbt.com"):
+        # For cell-based deployments.
+        # prefix.region.dbt.com -> prefix.metadata.region.dbt.com
+        hostname_parts = parsed_uri.hostname.split(".", maxsplit=1)
+        return f"{parsed_uri.scheme}://{hostname_parts[0]}.metadata.{hostname_parts[1]}/graphql"
+    elif parsed_uri.hostname.endswith(".getdbt.com"):
+        return f"{parsed_uri.scheme}://metadata.{parsed_uri.netloc}/graphql"
+    else:
+        # The self-hosted variants also have the metadata. prefix.
+        return f"{parsed_uri.scheme}://metadata.{parsed_uri.netloc}/graphql"


_DBT_GRAPHQL_COMMON_FIELDS = """
  runId
  accountId
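A quick sketch of the inference rules implemented above (hostnames are illustrative, not from the commit): multi-tenant *.getdbt.com hosts and self-hosted domains get a metadata. prefix, while cell-based *.dbt.com hosts have the metadata label inserted after the account prefix.

from datahub.ingestion.source.dbt.dbt_cloud import infer_metadata_endpoint

# Multi-tenant regional instance: "metadata." is prepended to the host ("au" is illustrative).
assert infer_metadata_endpoint("https://au.getdbt.com") == "https://metadata.au.getdbt.com/graphql"

# Cell-based deployment: "metadata" goes between the account prefix and the region ("acme" is hypothetical).
assert infer_metadata_endpoint("https://acme.us1.dbt.com") == "https://acme.metadata.us1.dbt.com/graphql"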
25 changes: 21 additions & 4 deletions metadata-ingestion/tests/unit/test_dbt_source.py
@@ -7,7 +7,10 @@

from datahub.emitter import mce_builder
from datahub.ingestion.api.common import PipelineContext
-from datahub.ingestion.source.dbt.dbt_cloud import DBTCloudConfig
+from datahub.ingestion.source.dbt.dbt_cloud import (
+    DBTCloudConfig,
+    infer_metadata_endpoint,
+)
from datahub.ingestion.source.dbt.dbt_core import (
    DBTCoreConfig,
    DBTCoreSource,
@@ -366,7 +369,7 @@ def test_dbt_entity_emission_configuration_helpers():

def test_dbt_cloud_config_access_url():
    config_dict = {
-        "access_url": "https://my-dbt-cloud.dbt.com",
+        "access_url": "https://emea.getdbt.com",
        "token": "dummy_token",
        "account_id": "123456",
        "project_id": "1234567",
@@ -375,8 +378,8 @@ def test_dbt_cloud_config_access_url():
        "target_platform": "dummy_platform",
    }
    config = DBTCloudConfig.parse_obj(config_dict)
-    assert config.access_url == "https://my-dbt-cloud.dbt.com"
-    assert config.metadata_endpoint == "https://metadata.my-dbt-cloud.dbt.com/graphql"
+    assert config.access_url == "https://emea.getdbt.com"
+    assert config.metadata_endpoint == "https://metadata.emea.getdbt.com/graphql"


def test_dbt_cloud_config_with_defined_metadata_endpoint():
@@ -398,6 +401,20 @@ def test_dbt_cloud_config_with_defined_metadata_endpoint():
    )


+def test_infer_metadata_endpoint() -> None:
+    assert (
+        infer_metadata_endpoint("https://cloud.getdbt.com")
+        == "https://metadata.cloud.getdbt.com/graphql"
+    )
+    assert (
+        infer_metadata_endpoint("https://prefix.us1.dbt.com")
+        == "https://prefix.metadata.us1.dbt.com/graphql"
+    )
+    assert (
+        infer_metadata_endpoint("http://dbt.corp.internal")
+    ) == "http://metadata.dbt.corp.internal/graphql"


def test_dbt_time_parsing() -> None:
    time_formats = [
        "2024-03-28T05:56:15.236210Z",
