diff --git a/docs/developers.md b/docs/developers.md
index b378ea282e20f2..980aa3e3acf879 100644
--- a/docs/developers.md
+++ b/docs/developers.md
@@ -12,8 +12,6 @@ title: "Local Development"
 - [Docker Compose >=2.20](https://docs.docker.com/compose/)
 - Docker engine with at least 8GB of memory to run tests.
 
-:::
-
 On macOS, these can be installed using [Homebrew](https://brew.sh/).
 
 ```shell
diff --git a/metadata-ingestion/src/datahub/ingestion/source/openapi.py b/metadata-ingestion/src/datahub/ingestion/source/openapi.py
index 1b3a6dc4bee58c..54affafdcc9780 100755
--- a/metadata-ingestion/src/datahub/ingestion/source/openapi.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/openapi.py
@@ -4,9 +4,10 @@
 from abc import ABC
 from typing import Dict, Iterable, Optional, Tuple
 
+from pydantic import validator
 from pydantic.fields import Field
 
-from datahub.configuration.common import ConfigModel
+from datahub.configuration.common import ConfigModel, ConfigurationError
 from datahub.emitter.mce_builder import make_tag_urn
 from datahub.ingestion.api.common import PipelineContext
 from datahub.ingestion.api.decorators import (
@@ -74,14 +75,33 @@ class OpenApiConfig(ConfigModel):
     token: Optional[str] = Field(
         default=None, description="Token for endpoint authentication."
     )
+    bearer_token: Optional[str] = Field(
+        default=None, description="Bearer token for endpoint authentication."
+    )
     get_token: dict = Field(
         default={}, description="Retrieving a token from the endpoint."
     )
 
+    @validator("bearer_token", always=True)
+    def ensure_only_one_token(
+        cls, bearer_token: Optional[str], values: Dict
+    ) -> Optional[str]:
+        if bearer_token is not None and values.get("token") is not None:
+            raise ConfigurationError(
+                "Unable to use 'token' and 'bearer_token' together."
+            )
+        return bearer_token
+
     def get_swagger(self) -> Dict:
-        if self.get_token or self.token is not None:
-            if self.token is not None:
-                ...
+        if self.get_token or self.token or self.bearer_token is not None:
+            if self.token:
+                pass
+            elif self.bearer_token:
+                # TRICKY: To avoid passing a bunch of different token types around, we set the
+                # token's value to the properly formatted bearer token.
+                # TODO: We should just create a requests.Session and set all the auth
+                # details there once, and then use that session for all requests.
+                self.token = f"Bearer {self.bearer_token}"
             else:
                 assert (
                     "url_complement" in self.get_token.keys()
@@ -283,10 +303,11 @@ def get_workunits_internal(self) -> Iterable[ApiWorkUnit]:  # noqa: C901
                 "{" not in endpoint_k
             ):  # if the API does not explicitly require parameters
                 tot_url = clean_url(config.url + self.url_basepath + endpoint_k)
-
                 if config.token:
                     response = request_call(
-                        tot_url, token=config.token, proxies=config.proxies
+                        tot_url,
+                        token=config.token,
+                        proxies=config.proxies,
                     )
                 else:
                     response = request_call(
@@ -314,7 +335,9 @@ def get_workunits_internal(self) -> Iterable[ApiWorkUnit]:  # noqa: C901
                     tot_url = clean_url(config.url + self.url_basepath + url_guess)
                     if config.token:
                         response = request_call(
-                            tot_url, token=config.token, proxies=config.proxies
+                            tot_url,
+                            token=config.token,
+                            proxies=config.proxies,
                         )
                     else:
                         response = request_call(
@@ -342,7 +365,9 @@ def get_workunits_internal(self) -> Iterable[ApiWorkUnit]:  # noqa: C901
                     tot_url = clean_url(config.url + self.url_basepath + composed_url)
                     if config.token:
                         response = request_call(
-                            tot_url, token=config.token, proxies=config.proxies
+                            tot_url,
+                            token=config.token,
+                            proxies=config.proxies,
                         )
                     else:
                         response = request_call(
diff --git a/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py b/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py
index c1caca18fefe3f..5bacafaa3f5885 100755
--- a/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/openapi_parser.py
@@ -54,12 +54,10 @@ def request_call(
     proxies: Optional[dict] = None,
 ) -> requests.Response:
     headers = {"accept": "application/json"}
-
     if username is not None and password is not None:
         return requests.get(
             url, headers=headers, auth=HTTPBasicAuth(username, password)
         )
-
     elif token is not None:
         headers["Authorization"] = f"{token}"
         return requests.get(url, proxies=proxies, headers=headers)
@@ -76,12 +74,9 @@ def get_swag_json(
     proxies: Optional[dict] = None,
 ) -> Dict:
     tot_url = url + swagger_file
-    if token is not None:
-        response = request_call(url=tot_url, token=token, proxies=proxies)
-    else:
-        response = request_call(
-            url=tot_url, username=username, password=password, proxies=proxies
-        )
+    response = request_call(
+        url=tot_url, token=token, username=username, password=password, proxies=proxies
+    )
 
     if response.status_code != 200:
         raise Exception(f"Unable to retrieve {tot_url}, error {response.status_code}")
diff --git a/metadata-jobs/mae-consumer-job/README.md b/metadata-jobs/mae-consumer-job/README.md
index 5f48d99eb736ca..81fec7a00c6e2e 100644
--- a/metadata-jobs/mae-consumer-job/README.md
+++ b/metadata-jobs/mae-consumer-job/README.md
@@ -17,35 +17,43 @@ Today the job consumes from two important Kafka topics:
 > Where does the name **Metadata Audit Event** come from? Well, history. Previously, this job consumed
 > a single `MetadataAuditEvent` topic which has been deprecated and removed from the critical path. Hence, the name!
 
-## Pre-requisites
-* You need to have [JDK8](https://www.oracle.com/java/technologies/jdk8-downloads.html)
-installed on your machine to be able to build `DataHub Metadata Service`.
+## Prerequisites
+
+Follow the [main developer guide](../../docs/developers.md) to set up your development environment and install the required dependencies.
 
 ## Build
+
 `Metadata Audit Event Consumer Job` is already built as part of top level build:
-```
+
+```shell
 ./gradlew build
 ```
+
 However, if you only want to build `MAE Consumer Job` specifically:
-```
+
+```shell
 ./gradlew :metadata-jobs:mae-consumer-job:build
 ```
 
 ## Dependencies
+
-Before starting `Metadata Audit Event Consumer Job`, you need to make sure that [Kafka, Schema Registry & Zookeeper](../../docker/kafka-setup),
-[Elasticsearch](../../docker/elasticsearch), and [Neo4j](../../docker/neo4j) Docker containers are up and running.
+
+Before starting `Metadata Audit Event Consumer Job`, you need to make sure that all backend services, including Kafka and ElasticSearch, are up and running. If GMS is healthy, then Kafka and ElasticSearch should be healthy as well.
 
 ## Start via Docker image
+
 The quickest way to try out `Metadata Audit Event Consumer Job` is running the [Docker image](../../docker/datahub-mae-consumer).
 
 ## Start via command line
+
 If you do modify things and want to try it out quickly without building the Docker image, you can
 also run the application directly from command line after a successful [build](#build):
-```
+
+```shell
 MCL_CONSUMER_ENABLED=true ./gradlew :metadata-jobs:mae-consumer-job:bootRun
 ```
 
 ## Endpoints
+
 Spring boot actuator has been enabled for MAE Application.
 `healthcheck`, `metrics` and `info` web endpoints are enabled by default.
diff --git a/metadata-service/README.md b/metadata-service/README.md
index f56f4a809ac8ff..8aec1ecc3ab92a 100644
--- a/metadata-service/README.md
+++ b/metadata-service/README.md
@@ -9,16 +9,18 @@ DataHub Metadata Service is a service written in Java consisting of multiple ser
 2. A general-purpose Rest.li API for ingesting the underlying storage models composing the Metadata graph.
 
 ## Pre-requisites
-* You need to have [JDK8](https://www.oracle.com/java/technologies/jdk8-downloads.html)
-installed on your machine to be able to build `DataHub Metadata Service`.
+
+Follow the [main developer guide](../docs/developers.md) to set up your development environment and install the required dependencies.
 
 ## Build
 
 `DataHub Metadata Service` is already built as part of top level build:
-```
+
+```shell
 ./gradlew build
 ```
+
 However, if you only want to build `DataHub Metadata Service` specifically:
-```
+```shell
 ./gradlew :metadata-service:war:build
 ```
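
For context on the configuration change above: `OpenApiConfig` now accepts either `token` or `bearer_token`, never both, and a supplied bearer token is normalised once into the `Bearer <value>` string that `request_call` later sends as the `Authorization` header. The sketch below is a minimal, standalone illustration of that validation pattern, not DataHub code: it assumes pydantic v1-style validators (as used in the patch), and `ExampleAuthConfig` plus the plain `ValueError` are illustrative stand-ins for `OpenApiConfig` and `ConfigurationError`.

```python
from typing import Dict, Optional

from pydantic import BaseModel, ValidationError, validator


class ExampleAuthConfig(BaseModel):
    # Order matters: `token` is declared first, so it is already present in
    # `values` when the `bearer_token` validator runs.
    token: Optional[str] = None
    bearer_token: Optional[str] = None

    @validator("bearer_token", always=True)
    def ensure_only_one_token(
        cls, bearer_token: Optional[str], values: Dict
    ) -> Optional[str]:
        # Mirror of the check added to OpenApiConfig: the two auth styles are
        # mutually exclusive.
        if bearer_token is not None and values.get("token") is not None:
            raise ValueError("Unable to use 'token' and 'bearer_token' together.")
        return bearer_token


config = ExampleAuthConfig(bearer_token="abc123")
# The patch formats the bearer token once and reuses the existing `token` code
# path, which ultimately produces an HTTP header like this:
headers = {"Authorization": f"Bearer {config.bearer_token}"}
print(headers)  # {'Authorization': 'Bearer abc123'}

try:
    ExampleAuthConfig(token="raw-token", bearer_token="abc123")
except ValidationError as err:
    print(err)  # both tokens set -> rejected by the validator
```

In the actual source, the rejection raises `ConfigurationError`, and `get_swagger` rewrites `self.token` to `f"Bearer {self.bearer_token}"` so the rest of the ingestion code can keep treating `config.token` as the single source of auth details.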