From 2a0200b0477ce5a0c697876b4619484b3caed9d5 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 21 Sep 2023 14:28:51 -0700 Subject: [PATCH 1/3] feat(ingest): bump acryl-sqlglot (#8882) --- metadata-ingestion/setup.py | 2 +- ...est_select_ambiguous_column_no_schema.json | 31 +++++++++++++++++++ .../unit/sql_parsing/test_sqlglot_lineage.py | 10 ++++++ 3 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_ambiguous_column_no_schema.json diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index b9169186174fa..e748461b156ae 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -138,7 +138,7 @@ def get_long_description(): sqlglot_lib = { # Using an Acryl fork of sqlglot. # https://github.com/tobymao/sqlglot/compare/main...hsheth2:sqlglot:hsheth?expand=1 - "acryl-sqlglot==18.0.2.dev15", + "acryl-sqlglot==18.5.2.dev45", } aws_common = { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_ambiguous_column_no_schema.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_ambiguous_column_no_schema.json new file mode 100644 index 0000000000000..10f5ee20b0c1f --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_ambiguous_column_no_schema.json @@ -0,0 +1,31 @@ +{ + "query_type": "SELECT", + "in_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:hive,t1,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:hive,t2,PROD)" + ], + "out_tables": [], + "column_lineage": [ + { + "downstream": { + "table": null, + "column": "a" + }, + "upstreams": [] + }, + { + "downstream": { + "table": null, + "column": "b" + }, + "upstreams": [] + }, + { + "downstream": { + "table": null, + "column": "c" + }, + "upstreams": [] + } + ] +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py index 7581d3bac010e..483c1ac4cc7f9 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py @@ -208,6 +208,16 @@ def test_select_from_union(): ) +def test_select_ambiguous_column_no_schema(): + assert_sql_result( + """ + select A, B, C from t1 inner join t2 on t1.id = t2.id + """, + dialect="hive", + expected_file=RESOURCE_DIR / "test_select_ambiguous_column_no_schema.json", + ) + + def test_merge_from_union(): # TODO: We don't support merge statements yet, but the union should still get handled. 
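
Note (not part of the patch): a minimal standalone sketch of what the new test exercises, assuming the sqlglot_lineage() helper and SchemaResolver in datahub.utilities.sqlglot_lineage keep the signatures visible elsewhere in this series; platform/env values here are illustrative.

    from datahub.utilities.sqlglot_lineage import SchemaResolver, sqlglot_lineage

    # No schemas are registered, mirroring the "no schema" golden file above.
    resolver = SchemaResolver(platform="hive", env="PROD")

    result = sqlglot_lineage(
        "select A, B, C from t1 inner join t2 on t1.id = t2.id",
        schema_resolver=resolver,
    )

    # Without schema info the parser cannot attribute A/B/C to t1 or t2, so each
    # downstream column should come back with an empty upstream list, matching
    # test_select_ambiguous_column_no_schema.json.
    for cl in result.column_lineage or []:
        print(cl.downstream.column, cl.upstreams)
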
From 5481e19e0a66de0ae3567198c1de11565edfce5c Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Fri, 22 Sep 2023 03:35:26 +0530 Subject: [PATCH 2/3] feat(ingest): bulk fetch schema info for schema resolver (#8865) Co-authored-by: Harshal Sheth --- .../src/datahub/ingestion/graph/client.py | 426 +++++++++++------- .../ingestion/source/bigquery_v2/bigquery.py | 25 +- .../src/datahub/utilities/sqlglot_lineage.py | 34 ++ 3 files changed, 324 insertions(+), 161 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index b371ab181e133..38e965f7f6587 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -44,14 +44,17 @@ TelemetryClientIdClass, ) from datahub.utilities.perf_timer import PerfTimer -from datahub.utilities.urns.dataset_urn import DatasetUrn from datahub.utilities.urns.urn import Urn, guess_entity_type if TYPE_CHECKING: from datahub.ingestion.source.state.entity_removal_state import ( GenericCheckpointState, ) - from datahub.utilities.sqlglot_lineage import SchemaResolver, SqlParsingResult + from datahub.utilities.sqlglot_lineage import ( + GraphQLSchemaMetadata, + SchemaResolver, + SqlParsingResult, + ) logger = logging.getLogger(__name__) @@ -543,129 +546,110 @@ def get_container_urns_by_filter( logger.debug(f"yielding {x['entity']}") yield x["entity"] - def get_urns_by_filter( + def _bulk_fetch_schema_info_by_filter( self, *, - entity_types: Optional[List[str]] = None, platform: Optional[str] = None, platform_instance: Optional[str] = None, env: Optional[str] = None, query: Optional[str] = None, container: Optional[str] = None, status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED, - batch_size: int = 10000, + batch_size: int = 100, extraFilters: Optional[List[SearchFilterRule]] = None, - ) -> Iterable[str]: - """Fetch all urns that match all of the given filters. + ) -> Iterable[Tuple[str, "GraphQLSchemaMetadata"]]: + """Fetch schema info for datasets that match all of the given filters. - Filters are combined conjunctively. If multiple filters are specified, the results will match all of them. - Note that specifying a platform filter will automatically exclude all entity types that do not have a platform. - The same goes for the env filter. + :return: An iterable of (urn, schema info) tuple that match the filters. + """ + types = [_graphql_entity_type("dataset")] - :param entity_types: List of entity types to include. If None, all entity types will be returned. - :param platform: Platform to filter on. If None, all platforms will be returned. - :param platform_instance: Platform instance to filter on. If None, all platform instances will be returned. - :param env: Environment (e.g. PROD, DEV) to filter on. If None, all environments will be returned. - :param query: Query string to filter on. If None, all entities will be returned. - :param container: A container urn that entities must be within. - This works recursively, so it will include entities within sub-containers as well. - If None, all entities will be returned. - Note that this requires browsePathV2 aspects (added in 0.10.4+). - :param status: Filter on the deletion status of the entity. The default is only return non-soft-deleted entities. - :param extraFilters: Additional filters to apply. If specified, the results will match all of the filters. 
+ # Add the query default of * if no query is specified. + query = query or "*" - :return: An iterable of urns that match the filters. - """ + orFilters = self.generate_filter( + platform, platform_instance, env, container, status, extraFilters + ) - types: Optional[List[str]] = None - if entity_types is not None: - if not entity_types: - raise ValueError( - "entity_types cannot be an empty list; use None for all entities" - ) + graphql_query = textwrap.dedent( + """ + query scrollUrnsWithFilters( + $types: [EntityType!], + $query: String!, + $orFilters: [AndFilterInput!], + $batchSize: Int!, + $scrollId: String) { - types = [_graphql_entity_type(entity_type) for entity_type in entity_types] + scrollAcrossEntities(input: { + query: $query, + count: $batchSize, + scrollId: $scrollId, + types: $types, + orFilters: $orFilters, + searchFlags: { + skipHighlighting: true + skipAggregates: true + } + }) { + nextScrollId + searchResults { + entity { + urn + ... on Dataset { + schemaMetadata(version: 0) { + fields { + fieldPath + nativeDataType + } + } + } + } + } + } + } + """ + ) - # Add the query default of * if no query is specified. - query = query or "*" + variables = { + "types": types, + "query": query, + "orFilters": orFilters, + "batchSize": batch_size, + } + + for entity in self._scroll_across_entities(graphql_query, variables): + if entity.get("schemaMetadata"): + yield entity["urn"], entity["schemaMetadata"] + def generate_filter( + self, + platform: Optional[str], + platform_instance: Optional[str], + env: Optional[str], + container: Optional[str], + status: RemovedStatusFilter, + extraFilters: Optional[List[SearchFilterRule]], + ) -> List[Dict[str, List[SearchFilterRule]]]: andFilters: List[SearchFilterRule] = [] # Platform filter. if platform: - andFilters += [ - { - "field": "platform.keyword", - "values": [make_data_platform_urn(platform)], - "condition": "EQUAL", - } - ] + andFilters.append(self._get_platform_filter(platform)) # Platform instance filter. if platform_instance: - if platform: - # Massage the platform instance into a fully qualified urn, if necessary. - platform_instance = make_dataplatform_instance_urn( - platform, platform_instance - ) - - # Warn if platform_instance is not a fully qualified urn. - # TODO: Change this once we have a first-class data platform instance urn type. - if guess_entity_type(platform_instance) != "dataPlatformInstance": - raise ValueError( - f"Invalid data platform instance urn: {platform_instance}" - ) - - andFilters += [ - { - "field": "platformInstance", - "values": [platform_instance], - "condition": "EQUAL", - } - ] + andFilters.append( + self._get_platform_instance_filter(platform, platform_instance) + ) # Browse path v2 filter. if container: - # Warn if container is not a fully qualified urn. - # TODO: Change this once we have a first-class container urn type. - if guess_entity_type(container) != "container": - raise ValueError(f"Invalid container urn: {container}") - - andFilters += [ - { - "field": "browsePathV2", - "values": [container], - "condition": "CONTAIN", - } - ] + andFilters.append(self._get_container_filter(container)) # Status filter. - if status == RemovedStatusFilter.NOT_SOFT_DELETED: - # Subtle: in some cases (e.g. when the dataset doesn't have a status aspect), the - # removed field is simply not present in the ElasticSearch document. Ideally this - # would be a "removed" : "false" filter, but that doesn't work. Instead, we need to - # use a negated filter. 
- andFilters.append( - { - "field": "removed", - "values": ["true"], - "condition": "EQUAL", - "negated": True, - } - ) - elif status == RemovedStatusFilter.ONLY_SOFT_DELETED: - andFilters.append( - { - "field": "removed", - "values": ["true"], - "condition": "EQUAL", - } - ) - elif status == RemovedStatusFilter.ALL: - # We don't need to add a filter for this case. - pass - else: - raise ValueError(f"Invalid status filter: {status}") + status_filter = self._get_status_filer(status) + if status_filter: + andFilters.append(status_filter) # Extra filters. if extraFilters: @@ -673,33 +657,9 @@ def get_urns_by_filter( orFilters: List[Dict[str, List[SearchFilterRule]]] = [{"and": andFilters}] - # Env filter. + # Env filter if env: - # The env filter is a bit more tricky since it's not always stored - # in the same place in ElasticSearch. - - envOrConditions: List[SearchFilterRule] = [ - # For most entity types, we look at the origin field. - { - "field": "origin", - "value": env, - "condition": "EQUAL", - }, - # For containers, we look at the customProperties field. - # For any containers created after https://github.com/datahub-project/datahub/pull/8027, - # we look for the "env" property. Otherwise, we use the "instance" property. - { - "field": "customProperties", - "value": f"env={env}", - }, - { - "field": "customProperties", - "value": f"instance={env}", - }, - # Note that not all entity types have an env (e.g. dashboards / charts). - # If the env filter is specified, these will be excluded. - ] - + envOrConditions = self._get_env_or_conditions(env) # This matches ALL of the andFilters and at least one of the envOrConditions. orFilters = [ {"and": andFilters["and"] + [extraCondition]} @@ -707,6 +667,52 @@ def get_urns_by_filter( for andFilters in orFilters ] + return orFilters + + def get_urns_by_filter( + self, + *, + entity_types: Optional[List[str]] = None, + platform: Optional[str] = None, + platform_instance: Optional[str] = None, + env: Optional[str] = None, + query: Optional[str] = None, + container: Optional[str] = None, + status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED, + batch_size: int = 10000, + extraFilters: Optional[List[SearchFilterRule]] = None, + ) -> Iterable[str]: + """Fetch all urns that match all of the given filters. + + Filters are combined conjunctively. If multiple filters are specified, the results will match all of them. + Note that specifying a platform filter will automatically exclude all entity types that do not have a platform. + The same goes for the env filter. + + :param entity_types: List of entity types to include. If None, all entity types will be returned. + :param platform: Platform to filter on. If None, all platforms will be returned. + :param platform_instance: Platform instance to filter on. If None, all platform instances will be returned. + :param env: Environment (e.g. PROD, DEV) to filter on. If None, all environments will be returned. + :param query: Query string to filter on. If None, all entities will be returned. + :param container: A container urn that entities must be within. + This works recursively, so it will include entities within sub-containers as well. + If None, all entities will be returned. + Note that this requires browsePathV2 aspects (added in 0.10.4+). + :param status: Filter on the deletion status of the entity. The default is only return non-soft-deleted entities. + :param extraFilters: Additional filters to apply. If specified, the results will match all of the filters. 
+ + :return: An iterable of urns that match the filters. + """ + + types = self._get_types(entity_types) + + # Add the query default of * if no query is specified. + query = query or "*" + + # Env filter. + orFilters = self.generate_filter( + platform, platform_instance, env, container, status, extraFilters + ) + graphql_query = textwrap.dedent( """ query scrollUrnsWithFilters( @@ -738,18 +744,26 @@ def get_urns_by_filter( """ ) + variables = { + "types": types, + "query": query, + "orFilters": orFilters, + "batchSize": batch_size, + } + + for entity in self._scroll_across_entities(graphql_query, variables): + yield entity["urn"] + + def _scroll_across_entities( + self, graphql_query: str, variables_orig: dict + ) -> Iterable[dict]: + variables = variables_orig.copy() first_iter = True scroll_id: Optional[str] = None while first_iter or scroll_id: first_iter = False + variables["scrollId"] = scroll_id - variables = { - "types": types, - "query": query, - "orFilters": orFilters, - "batchSize": batch_size, - "scrollId": scroll_id, - } response = self.execute_graphql( graphql_query, variables=variables, @@ -757,13 +771,116 @@ def get_urns_by_filter( data = response["scrollAcrossEntities"] scroll_id = data["nextScrollId"] for entry in data["searchResults"]: - yield entry["entity"]["urn"] + yield entry["entity"] if scroll_id: logger.debug( f"Scrolling to next scrollAcrossEntities page: {scroll_id}" ) + def _get_env_or_conditions(self, env: str) -> List[SearchFilterRule]: + # The env filter is a bit more tricky since it's not always stored + # in the same place in ElasticSearch. + return [ + # For most entity types, we look at the origin field. + { + "field": "origin", + "value": env, + "condition": "EQUAL", + }, + # For containers, we look at the customProperties field. + # For any containers created after https://github.com/datahub-project/datahub/pull/8027, + # we look for the "env" property. Otherwise, we use the "instance" property. + { + "field": "customProperties", + "value": f"env={env}", + }, + { + "field": "customProperties", + "value": f"instance={env}", + }, + # Note that not all entity types have an env (e.g. dashboards / charts). + # If the env filter is specified, these will be excluded. + ] + + def _get_status_filer( + self, status: RemovedStatusFilter + ) -> Optional[SearchFilterRule]: + if status == RemovedStatusFilter.NOT_SOFT_DELETED: + # Subtle: in some cases (e.g. when the dataset doesn't have a status aspect), the + # removed field is simply not present in the ElasticSearch document. Ideally this + # would be a "removed" : "false" filter, but that doesn't work. Instead, we need to + # use a negated filter. + return { + "field": "removed", + "values": ["true"], + "condition": "EQUAL", + "negated": True, + } + + elif status == RemovedStatusFilter.ONLY_SOFT_DELETED: + return { + "field": "removed", + "values": ["true"], + "condition": "EQUAL", + } + + elif status == RemovedStatusFilter.ALL: + # We don't need to add a filter for this case. + return None + else: + raise ValueError(f"Invalid status filter: {status}") + + def _get_container_filter(self, container: str) -> SearchFilterRule: + # Warn if container is not a fully qualified urn. + # TODO: Change this once we have a first-class container urn type. 
+ if guess_entity_type(container) != "container": + raise ValueError(f"Invalid container urn: {container}") + + return { + "field": "browsePathV2", + "values": [container], + "condition": "CONTAIN", + } + + def _get_platform_instance_filter( + self, platform: Optional[str], platform_instance: str + ) -> SearchFilterRule: + if platform: + # Massage the platform instance into a fully qualified urn, if necessary. + platform_instance = make_dataplatform_instance_urn( + platform, platform_instance + ) + + # Warn if platform_instance is not a fully qualified urn. + # TODO: Change this once we have a first-class data platform instance urn type. + if guess_entity_type(platform_instance) != "dataPlatformInstance": + raise ValueError(f"Invalid data platform instance urn: {platform_instance}") + + return { + "field": "platformInstance", + "values": [platform_instance], + "condition": "EQUAL", + } + + def _get_platform_filter(self, platform: str) -> SearchFilterRule: + return { + "field": "platform.keyword", + "values": [make_data_platform_urn(platform)], + "condition": "EQUAL", + } + + def _get_types(self, entity_types: Optional[List[str]]) -> Optional[List[str]]: + types: Optional[List[str]] = None + if entity_types is not None: + if not entity_types: + raise ValueError( + "entity_types cannot be an empty list; use None for all entities" + ) + + types = [_graphql_entity_type(entity_type) for entity_type in entity_types] + return types + def get_latest_pipeline_checkpoint( self, pipeline_name: str, platform: str ) -> Optional[Checkpoint["GenericCheckpointState"]]: @@ -1033,43 +1150,36 @@ def initialize_schema_resolver_from_datahub( self, platform: str, platform_instance: Optional[str], env: str ) -> Tuple["SchemaResolver", Set[str]]: logger.info("Initializing schema resolver") - - # TODO: Filter on platform instance? 
- logger.info(f"Fetching urns for platform {platform}, env {env}") - with PerfTimer() as timer: - urns = set( - self.get_urns_by_filter( - entity_types=[DatasetUrn.ENTITY_TYPE], - platform=platform, - env=env, - batch_size=3000, - ) - ) - logger.info( - f"Fetched {len(urns)} urns in {timer.elapsed_seconds()} seconds" - ) - schema_resolver = self._make_schema_resolver( platform, platform_instance, env, include_graph=False ) + + logger.info(f"Fetching schemas for platform {platform}, env {env}") + urns = [] + count = 0 with PerfTimer() as timer: - count = 0 - for i, urn in enumerate(urns): - if i % 1000 == 0: - logger.debug(f"Loaded {i} schema metadata") + for urn, schema_info in self._bulk_fetch_schema_info_by_filter( + platform=platform, + platform_instance=platform_instance, + env=env, + ): try: - schema_metadata = self.get_aspect(urn, SchemaMetadataClass) - if schema_metadata: - schema_resolver.add_schema_metadata(urn, schema_metadata) - count += 1 + urns.append(urn) + schema_resolver.add_graphql_schema_metadata(urn, schema_info) + count += 1 except Exception: - logger.warning("Failed to load schema metadata", exc_info=True) + logger.warning("Failed to add schema info", exc_info=True) + + if count % 1000 == 0: + logger.debug( + f"Loaded {count} schema info in {timer.elapsed_seconds()} seconds" + ) logger.info( - f"Loaded {count} schema metadata in {timer.elapsed_seconds()} seconds" + f"Finished loading total {count} schema info in {timer.elapsed_seconds()} seconds" ) logger.info("Finished initializing schema resolver") - return schema_resolver, urns + return schema_resolver, set(urns) def parse_sql_lineage( self, diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index ae49a4ba17c11..8a16b1a4a5f6b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -285,9 +285,7 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): # Maps view ref -> actual sql self.view_definitions: FileBackedDict[str] = FileBackedDict() - self.sql_parser_schema_resolver = SchemaResolver( - platform=self.platform, env=self.config.env - ) + self.sql_parser_schema_resolver = self._init_schema_resolver() self.add_config_to_report() atexit.register(cleanup, config) @@ -446,6 +444,27 @@ def test_connection(config_dict: dict) -> TestConnectionReport: ) return test_report + def _init_schema_resolver(self) -> SchemaResolver: + schema_resolution_required = ( + self.config.lineage_parse_view_ddl or self.config.lineage_use_sql_parser + ) + schema_ingestion_enabled = ( + self.config.include_views and self.config.include_tables + ) + + if schema_resolution_required and not schema_ingestion_enabled: + if self.ctx.graph: + return self.ctx.graph.initialize_schema_resolver_from_datahub( + platform=self.platform, + platform_instance=self.config.platform_instance, + env=self.config.env, + )[0] + else: + logger.warning( + "Failed to load schema info from DataHub as DataHubGraph is missing.", + ) + return SchemaResolver(platform=self.platform, env=self.config.env) + def get_dataplatform_instance_aspect( self, dataset_urn: str, project_id: str ) -> MetadataWorkUnit: diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py index d677b0874b985..f18235af3d1fd 100644 --- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py 
+++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py @@ -14,6 +14,7 @@ import sqlglot.optimizer.qualify import sqlglot.optimizer.qualify_columns from pydantic import BaseModel +from typing_extensions import TypedDict from datahub.emitter.mce_builder import ( DEFAULT_ENV, @@ -36,6 +37,15 @@ SQL_PARSE_RESULT_CACHE_SIZE = 1000 +class GraphQLSchemaField(TypedDict): + fieldPath: str + nativeDataType: str + + +class GraphQLSchemaMetadata(TypedDict): + fields: List[GraphQLSchemaField] + + class QueryType(enum.Enum): CREATE = "CREATE" SELECT = "SELECT" @@ -330,6 +340,12 @@ def add_schema_metadata( def add_raw_schema_info(self, urn: str, schema_info: SchemaInfo) -> None: self._save_to_cache(urn, schema_info) + def add_graphql_schema_metadata( + self, urn: str, schema_metadata: GraphQLSchemaMetadata + ) -> None: + schema_info = self.convert_graphql_schema_metadata_to_info(schema_metadata) + self._save_to_cache(urn, schema_info) + def _save_to_cache(self, urn: str, schema_info: Optional[SchemaInfo]) -> None: self._schema_cache[urn] = schema_info @@ -356,6 +372,24 @@ def _convert_schema_aspect_to_info( not in DatasetUrn.get_simple_field_path_from_v2_field_path(col.fieldPath) } + @classmethod + def convert_graphql_schema_metadata_to_info( + cls, schema: GraphQLSchemaMetadata + ) -> SchemaInfo: + return { + DatasetUrn.get_simple_field_path_from_v2_field_path(field["fieldPath"]): ( + # The actual types are more of a "nice to have". + field["nativeDataType"] + or "str" + ) + for field in schema["fields"] + # TODO: We can't generate lineage to columns nested within structs yet. + if "." + not in DatasetUrn.get_simple_field_path_from_v2_field_path( + field["fieldPath"] + ) + } + # TODO add a method to load all from graphql def close(self) -> None: From 4be8fd0905b6631ddf7161ab412719bed786882a Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 21 Sep 2023 15:59:56 -0700 Subject: [PATCH 3/3] fix(docs): remove link-checker from CI (#8883) --- docs-website/markdown-link-check-config.json | 37 ++++++++------------ docs-website/package.json | 6 ++-- 2 files changed, 18 insertions(+), 25 deletions(-) diff --git a/docs-website/markdown-link-check-config.json b/docs-website/markdown-link-check-config.json index 26e040edde6f7..2f5a51ada324e 100644 --- a/docs-website/markdown-link-check-config.json +++ b/docs-website/markdown-link-check-config.json @@ -1,50 +1,41 @@ { "ignorePatterns": [ { - "pattern": "^http://demo\\.datahubproject\\.io" + "pattern": "^https?://demo\\.datahubproject\\.io" }, { - "pattern": "^http://localhost" + "pattern": "^http://localhost" }, { - "pattern": "^http://www.famfamfam.com" + "pattern": "^/docs" }, { - "pattern": "^http://www.linkedin.com" + "pattern": "^/integrations" }, { - "pattern": "\\.md$" + "pattern": "^https?://www.linkedin.com" }, { - "pattern":"\\.json$" + "pattern": "\\.md(#.*)?$" }, { - "pattern":"\\.txt$" + "pattern": "\\.json$" }, { - "pattern": "\\.java$" + "pattern": "\\.txt$" }, { - "pattern": "\\.md#.*$" + "pattern": "\\.java$" }, { - "pattern": "^https://oauth2.googleapis.com/token" + "pattern": "^https://oauth2.googleapis.com/token" }, { - "pattern": "^https://login.microsoftonline.com/common/oauth2/na$" + "pattern": "^https://login.microsoftonline.com/common/oauth2/na$" }, { - "pattern": "#v(\\d+)-(\\d+)-(\\d+)" - }, - { - "pattern": "^https://github.com/mohdsiddique$" - }, - { - "pattern": "^https://github.com/2x$" - }, - { - "pattern": "^https://github.com/datahub-project/datahub/assets/15873986/2f47d033-6c2b-483a-951d-e6d6b807f0d0%22%3E$" + 
"pattern": "^https://github.com/datahub-project/datahub/assets/15873986/2f47d033-6c2b-483a-951d-e6d6b807f0d0%22%3E$" } ], - "aliveStatusCodes": [200, 206, 0, 999, 400, 401, 403] -} \ No newline at end of file + "aliveStatusCodes": [200, 206, 0, 999] +} diff --git a/docs-website/package.json b/docs-website/package.json index 1722f92169692..eca6e5814d3c6 100644 --- a/docs-website/package.json +++ b/docs-website/package.json @@ -17,8 +17,10 @@ "generate": "rm -rf genDocs genStatic && mkdir genDocs genStatic && yarn _generate-docs && mv docs/* genDocs/ && rmdir docs", "generate-rsync": "mkdir -p genDocs genStatic && yarn _generate-docs && rsync -v --checksum -r -h -i --delete docs/ genDocs && rm -rf docs", "lint": "prettier -w generateDocsDir.ts sidebars.js src/pages/index.js", - "lint-check": "prettier -l generateDocsDir.ts sidebars.js src/pages/index.js && find ./genDocs -name \\*.md -not -path \"./genDocs/python-sdk/models.md\" -print0 | xargs -0 -n1 markdown-link-check -p -q -c markdown-link-check-config.json", - "lint-fix": "prettier --write generateDocsDir.ts sidebars.js src/pages/index.js" + "lint-check": "prettier -l generateDocsDir.ts sidebars.js src/pages/index.js", + "lint-fix": "prettier --write generateDocsDir.ts sidebars.js src/pages/index.js", + "_list-link-check-files": "find ./genDocs -name '*.md' -not \\( -path './genDocs/python-sdk/*' -o -path './genDocs/releases.md' \\)", + "check-links": "yarn run -s _list-link-check-files -print0 | xargs -0 -n1 -t markdown-link-check -q -c markdown-link-check-config.json" }, "dependencies": { "@ant-design/icons": "^4.7.0",