Merge branch 'master' into great-expectations-pandas-support
treff7es authored Feb 12, 2024
2 parents ebbe8a5 + 709c596 commit 059e6ca
Showing 5 changed files with 58 additions and 30 deletions.
2 changes: 0 additions & 2 deletions docs/developers.md
@@ -12,8 +12,6 @@ title: "Local Development"
- [Docker Compose >=2.20](https://docs.docker.com/compose/)
- Docker engine with at least 8GB of memory to run tests.

-:::
-
On macOS, these can be installed using [Homebrew](https://brew.sh/).

```shell
41 changes: 33 additions & 8 deletions metadata-ingestion/src/datahub/ingestion/source/openapi.py
@@ -4,9 +4,10 @@
from abc import ABC
from typing import Dict, Iterable, Optional, Tuple

+from pydantic import validator
from pydantic.fields import Field

-from datahub.configuration.common import ConfigModel
+from datahub.configuration.common import ConfigModel, ConfigurationError
from datahub.emitter.mce_builder import make_tag_urn
from datahub.ingestion.api.common import PipelineContext
from datahub.ingestion.api.decorators import (
@@ -74,14 +75,33 @@ class OpenApiConfig(ConfigModel):
token: Optional[str] = Field(
default=None, description="Token for endpoint authentication."
)
+bearer_token: Optional[str] = Field(
+    default=None, description="Bearer token for endpoint authentication."
+)
get_token: dict = Field(
default={}, description="Retrieving a token from the endpoint."
)

@validator("bearer_token", always=True)
def ensure_only_one_token(
cls, bearer_token: Optional[str], values: Dict
) -> Optional[str]:
if bearer_token is not None and values.get("token") is not None:
raise ConfigurationError(
"Unable to use 'token' and 'bearer_token' together."
)
return bearer_token

def get_swagger(self) -> Dict:
-    if self.get_token or self.token is not None:
-        if self.token is not None:
-            ...
+    if self.get_token or self.token or self.bearer_token is not None:
+        if self.token:
+            pass
+        elif self.bearer_token:
+            # TRICKY: To avoid passing a bunch of different token types around, we set the
+            # token's value to the properly formatted bearer token.
+            # TODO: We should just create a requests.Session and set all the auth
+            # details there once, and then use that session for all requests.
+            self.token = f"Bearer {self.bearer_token}"
else:
assert (
"url_complement" in self.get_token.keys()
@@ -283,10 +303,11 @@ def get_workunits_internal(self) -> Iterable[ApiWorkUnit]: # noqa: C901
"{" not in endpoint_k
): # if the API does not explicitly require parameters
    tot_url = clean_url(config.url + self.url_basepath + endpoint_k)
-
    if config.token:
        response = request_call(
-            tot_url, token=config.token, proxies=config.proxies
+            tot_url,
+            token=config.token,
+            proxies=config.proxies,
)
else:
response = request_call(
@@ -314,7 +335,9 @@ def get_workunits_internal(self) -> Iterable[ApiWorkUnit]: # noqa: C901
tot_url = clean_url(config.url + self.url_basepath + url_guess)
if config.token:
response = request_call(
-        tot_url, token=config.token, proxies=config.proxies
+        tot_url,
+        token=config.token,
+        proxies=config.proxies,
)
else:
response = request_call(
@@ -342,7 +365,9 @@ def get_workunits_internal(self) -> Iterable[ApiWorkUnit]: # noqa: C901
tot_url = clean_url(config.url + self.url_basepath + composed_url)
if config.token:
response = request_call(
-        tot_url, token=config.token, proxies=config.proxies
+        tot_url,
+        token=config.token,
+        proxies=config.proxies,
)
else:
response = request_call(
11 changes: 3 additions & 8 deletions
@@ -54,12 +54,10 @@ def request_call(
proxies: Optional[dict] = None,
) -> requests.Response:
headers = {"accept": "application/json"}

if username is not None and password is not None:
return requests.get(
url, headers=headers, auth=HTTPBasicAuth(username, password)
)

elif token is not None:
headers["Authorization"] = f"{token}"
return requests.get(url, proxies=proxies, headers=headers)
@@ -76,12 +74,9 @@ def get_swag_json(
proxies: Optional[dict] = None,
) -> Dict:
tot_url = url + swagger_file
-    if token is not None:
-        response = request_call(url=tot_url, token=token, proxies=proxies)
-    else:
-        response = request_call(
-            url=tot_url, username=username, password=password, proxies=proxies
-        )
+    response = request_call(
+        url=tot_url, token=token, username=username, password=password, proxies=proxies
+    )

if response.status_code != 200:
raise Exception(f"Unable to retrieve {tot_url}, error {response.status_code}")
24 changes: 16 additions & 8 deletions metadata-jobs/mae-consumer-job/README.md
@@ -17,35 +17,43 @@ Today the job consumes from two important Kafka topics:
> Where does the name **Metadata Audit Event** come from? Well, history. Previously, this job consumed
> a single `MetadataAuditEvent` topic which has been deprecated and removed from the critical path. Hence, the name!
-## Pre-requisites
-* You need to have [JDK8](https://www.oracle.com/java/technologies/jdk8-downloads.html)
-installed on your machine to be able to build `DataHub Metadata Service`.
+## Prerequisites
+
+Follow the [main developer guide](../../docs/developers.md) to set up your development environment and install the required dependencies.

## Build

`Metadata Audit Event Consumer Job` is already built as part of top level build:
-```
+
+```shell
./gradlew build
```

However, if you only want to build `MAE Consumer Job` specifically:
-```
+
+```shell
./gradlew :metadata-jobs:mae-consumer-job:build
```

## Dependencies
-Before starting `Metadata Audit Event Consumer Job`, you need to make sure that [Kafka, Schema Registry & Zookeeper](../../docker/kafka-setup),
-[Elasticsearch](../../docker/elasticsearch), and [Neo4j](../../docker/neo4j) Docker containers are up and running.
+
+Before starting `Metadata Audit Event Consumer Job`, you need to make sure that all backend services, including Kafka and ElasticSearch, are up and running. If GMS is healthy, then Kafka and ElasticSearch should be healthy as well.

## Start via Docker image

The quickest way to try out `Metadata Audit Event Consumer Job` is running the [Docker image](../../docker/datahub-mae-consumer).

## Start via command line

If you do modify things and want to try it out quickly without building the Docker image, you can also run
the application directly from command line after a successful [build](#build):
-```
+
+```shell
MCL_CONSUMER_ENABLED=true ./gradlew :metadata-jobs:mae-consumer-job:bootRun
```

## Endpoints

Spring boot actuator has been enabled for MAE Application.
`healthcheck`, `metrics` and `info` web endpoints are enabled by default.

10 changes: 6 additions & 4 deletions metadata-service/README.md
@@ -9,16 +9,18 @@ DataHub Metadata Service is a service written in Java consisting of multiple ser
2. A general-purpose Rest.li API for ingesting the underlying storage models composing the Metadata graph.

## Pre-requisites
-* You need to have [JDK8](https://www.oracle.com/java/technologies/jdk8-downloads.html)
-installed on your machine to be able to build `DataHub Metadata Service`.
+
+Follow the [main developer guide](../docs/developers.md) to set up your development environment and install the required dependencies.

## Build
`DataHub Metadata Service` is already built as part of top level build:
-```
+
+```shell
./gradlew build
```

However, if you only want to build `DataHub Metadata Service` specifically:
-```
+```shell
./gradlew :metadata-service:war:build
```
