From 2a0200b0477ce5a0c697876b4619484b3caed9d5 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 21 Sep 2023 14:28:51 -0700 Subject: [PATCH 1/3] feat(ingest): bump acryl-sqlglot (#8882) --- metadata-ingestion/setup.py | 2 +- ...est_select_ambiguous_column_no_schema.json | 31 +++++++++++++++++++ .../unit/sql_parsing/test_sqlglot_lineage.py | 10 ++++++ 3 files changed, 42 insertions(+), 1 deletion(-) create mode 100644 metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_ambiguous_column_no_schema.json diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index b9169186174fa..e748461b156ae 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -138,7 +138,7 @@ def get_long_description(): sqlglot_lib = { # Using an Acryl fork of sqlglot. # https://github.com/tobymao/sqlglot/compare/main...hsheth2:sqlglot:hsheth?expand=1 - "acryl-sqlglot==18.0.2.dev15", + "acryl-sqlglot==18.5.2.dev45", } aws_common = { diff --git a/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_ambiguous_column_no_schema.json b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_ambiguous_column_no_schema.json new file mode 100644 index 0000000000000..10f5ee20b0c1f --- /dev/null +++ b/metadata-ingestion/tests/unit/sql_parsing/goldens/test_select_ambiguous_column_no_schema.json @@ -0,0 +1,31 @@ +{ + "query_type": "SELECT", + "in_tables": [ + "urn:li:dataset:(urn:li:dataPlatform:hive,t1,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:hive,t2,PROD)" + ], + "out_tables": [], + "column_lineage": [ + { + "downstream": { + "table": null, + "column": "a" + }, + "upstreams": [] + }, + { + "downstream": { + "table": null, + "column": "b" + }, + "upstreams": [] + }, + { + "downstream": { + "table": null, + "column": "c" + }, + "upstreams": [] + } + ] +} \ No newline at end of file diff --git a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py index 7581d3bac010e..483c1ac4cc7f9 100644 --- a/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py +++ b/metadata-ingestion/tests/unit/sql_parsing/test_sqlglot_lineage.py @@ -208,6 +208,16 @@ def test_select_from_union(): ) +def test_select_ambiguous_column_no_schema(): + assert_sql_result( + """ + select A, B, C from t1 inner join t2 on t1.id = t2.id + """, + dialect="hive", + expected_file=RESOURCE_DIR / "test_select_ambiguous_column_no_schema.json", + ) + + def test_merge_from_union(): # TODO: We don't support merge statements yet, but the union should still get handled. 
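
Note (not part of the patch): a minimal standalone sketch of what the new test exercises, assuming the sqlglot_lineage() helper and SchemaResolver in datahub.utilities.sqlglot_lineage keep the signatures visible elsewhere in this series; platform/env values here are illustrative.

    from datahub.utilities.sqlglot_lineage import SchemaResolver, sqlglot_lineage

    # No schemas are registered, mirroring the "no schema" golden file above.
    resolver = SchemaResolver(platform="hive", env="PROD")

    result = sqlglot_lineage(
        "select A, B, C from t1 inner join t2 on t1.id = t2.id",
        schema_resolver=resolver,
    )

    # Without schema info the parser cannot attribute A/B/C to t1 or t2, so each
    # downstream column should come back with an empty upstream list, matching
    # test_select_ambiguous_column_no_schema.json.
    for cl in result.column_lineage or []:
        print(cl.downstream.column, cl.upstreams)
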
From 5481e19e0a66de0ae3567198c1de11565edfce5c Mon Sep 17 00:00:00 2001 From: Mayuri Nehate <33225191+mayurinehate@users.noreply.github.com> Date: Fri, 22 Sep 2023 03:35:26 +0530 Subject: [PATCH 2/3] feat(ingest): bulk fetch schema info for schema resolver (#8865) Co-authored-by: Harshal Sheth --- .../src/datahub/ingestion/graph/client.py | 426 +++++++++++------- .../ingestion/source/bigquery_v2/bigquery.py | 25 +- .../src/datahub/utilities/sqlglot_lineage.py | 34 ++ 3 files changed, 324 insertions(+), 161 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/graph/client.py b/metadata-ingestion/src/datahub/ingestion/graph/client.py index b371ab181e133..38e965f7f6587 100644 --- a/metadata-ingestion/src/datahub/ingestion/graph/client.py +++ b/metadata-ingestion/src/datahub/ingestion/graph/client.py @@ -44,14 +44,17 @@ TelemetryClientIdClass, ) from datahub.utilities.perf_timer import PerfTimer -from datahub.utilities.urns.dataset_urn import DatasetUrn from datahub.utilities.urns.urn import Urn, guess_entity_type if TYPE_CHECKING: from datahub.ingestion.source.state.entity_removal_state import ( GenericCheckpointState, ) - from datahub.utilities.sqlglot_lineage import SchemaResolver, SqlParsingResult + from datahub.utilities.sqlglot_lineage import ( + GraphQLSchemaMetadata, + SchemaResolver, + SqlParsingResult, + ) logger = logging.getLogger(__name__) @@ -543,129 +546,110 @@ def get_container_urns_by_filter( logger.debug(f"yielding {x['entity']}") yield x["entity"] - def get_urns_by_filter( + def _bulk_fetch_schema_info_by_filter( self, *, - entity_types: Optional[List[str]] = None, platform: Optional[str] = None, platform_instance: Optional[str] = None, env: Optional[str] = None, query: Optional[str] = None, container: Optional[str] = None, status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED, - batch_size: int = 10000, + batch_size: int = 100, extraFilters: Optional[List[SearchFilterRule]] = None, - ) -> Iterable[str]: - """Fetch all urns that match all of the given filters. + ) -> Iterable[Tuple[str, "GraphQLSchemaMetadata"]]: + """Fetch schema info for datasets that match all of the given filters. - Filters are combined conjunctively. If multiple filters are specified, the results will match all of them. - Note that specifying a platform filter will automatically exclude all entity types that do not have a platform. - The same goes for the env filter. + :return: An iterable of (urn, schema info) tuple that match the filters. + """ + types = [_graphql_entity_type("dataset")] - :param entity_types: List of entity types to include. If None, all entity types will be returned. - :param platform: Platform to filter on. If None, all platforms will be returned. - :param platform_instance: Platform instance to filter on. If None, all platform instances will be returned. - :param env: Environment (e.g. PROD, DEV) to filter on. If None, all environments will be returned. - :param query: Query string to filter on. If None, all entities will be returned. - :param container: A container urn that entities must be within. - This works recursively, so it will include entities within sub-containers as well. - If None, all entities will be returned. - Note that this requires browsePathV2 aspects (added in 0.10.4+). - :param status: Filter on the deletion status of the entity. The default is only return non-soft-deleted entities. - :param extraFilters: Additional filters to apply. If specified, the results will match all of the filters. 
+ # Add the query default of * if no query is specified. + query = query or "*" - :return: An iterable of urns that match the filters. - """ + orFilters = self.generate_filter( + platform, platform_instance, env, container, status, extraFilters + ) - types: Optional[List[str]] = None - if entity_types is not None: - if not entity_types: - raise ValueError( - "entity_types cannot be an empty list; use None for all entities" - ) + graphql_query = textwrap.dedent( + """ + query scrollUrnsWithFilters( + $types: [EntityType!], + $query: String!, + $orFilters: [AndFilterInput!], + $batchSize: Int!, + $scrollId: String) { - types = [_graphql_entity_type(entity_type) for entity_type in entity_types] + scrollAcrossEntities(input: { + query: $query, + count: $batchSize, + scrollId: $scrollId, + types: $types, + orFilters: $orFilters, + searchFlags: { + skipHighlighting: true + skipAggregates: true + } + }) { + nextScrollId + searchResults { + entity { + urn + ... on Dataset { + schemaMetadata(version: 0) { + fields { + fieldPath + nativeDataType + } + } + } + } + } + } + } + """ + ) - # Add the query default of * if no query is specified. - query = query or "*" + variables = { + "types": types, + "query": query, + "orFilters": orFilters, + "batchSize": batch_size, + } + + for entity in self._scroll_across_entities(graphql_query, variables): + if entity.get("schemaMetadata"): + yield entity["urn"], entity["schemaMetadata"] + def generate_filter( + self, + platform: Optional[str], + platform_instance: Optional[str], + env: Optional[str], + container: Optional[str], + status: RemovedStatusFilter, + extraFilters: Optional[List[SearchFilterRule]], + ) -> List[Dict[str, List[SearchFilterRule]]]: andFilters: List[SearchFilterRule] = [] # Platform filter. if platform: - andFilters += [ - { - "field": "platform.keyword", - "values": [make_data_platform_urn(platform)], - "condition": "EQUAL", - } - ] + andFilters.append(self._get_platform_filter(platform)) # Platform instance filter. if platform_instance: - if platform: - # Massage the platform instance into a fully qualified urn, if necessary. - platform_instance = make_dataplatform_instance_urn( - platform, platform_instance - ) - - # Warn if platform_instance is not a fully qualified urn. - # TODO: Change this once we have a first-class data platform instance urn type. - if guess_entity_type(platform_instance) != "dataPlatformInstance": - raise ValueError( - f"Invalid data platform instance urn: {platform_instance}" - ) - - andFilters += [ - { - "field": "platformInstance", - "values": [platform_instance], - "condition": "EQUAL", - } - ] + andFilters.append( + self._get_platform_instance_filter(platform, platform_instance) + ) # Browse path v2 filter. if container: - # Warn if container is not a fully qualified urn. - # TODO: Change this once we have a first-class container urn type. - if guess_entity_type(container) != "container": - raise ValueError(f"Invalid container urn: {container}") - - andFilters += [ - { - "field": "browsePathV2", - "values": [container], - "condition": "CONTAIN", - } - ] + andFilters.append(self._get_container_filter(container)) # Status filter. - if status == RemovedStatusFilter.NOT_SOFT_DELETED: - # Subtle: in some cases (e.g. when the dataset doesn't have a status aspect), the - # removed field is simply not present in the ElasticSearch document. Ideally this - # would be a "removed" : "false" filter, but that doesn't work. Instead, we need to - # use a negated filter. 
- andFilters.append( - { - "field": "removed", - "values": ["true"], - "condition": "EQUAL", - "negated": True, - } - ) - elif status == RemovedStatusFilter.ONLY_SOFT_DELETED: - andFilters.append( - { - "field": "removed", - "values": ["true"], - "condition": "EQUAL", - } - ) - elif status == RemovedStatusFilter.ALL: - # We don't need to add a filter for this case. - pass - else: - raise ValueError(f"Invalid status filter: {status}") + status_filter = self._get_status_filer(status) + if status_filter: + andFilters.append(status_filter) # Extra filters. if extraFilters: @@ -673,33 +657,9 @@ def get_urns_by_filter( orFilters: List[Dict[str, List[SearchFilterRule]]] = [{"and": andFilters}] - # Env filter. + # Env filter if env: - # The env filter is a bit more tricky since it's not always stored - # in the same place in ElasticSearch. - - envOrConditions: List[SearchFilterRule] = [ - # For most entity types, we look at the origin field. - { - "field": "origin", - "value": env, - "condition": "EQUAL", - }, - # For containers, we look at the customProperties field. - # For any containers created after https://github.com/datahub-project/datahub/pull/8027, - # we look for the "env" property. Otherwise, we use the "instance" property. - { - "field": "customProperties", - "value": f"env={env}", - }, - { - "field": "customProperties", - "value": f"instance={env}", - }, - # Note that not all entity types have an env (e.g. dashboards / charts). - # If the env filter is specified, these will be excluded. - ] - + envOrConditions = self._get_env_or_conditions(env) # This matches ALL of the andFilters and at least one of the envOrConditions. orFilters = [ {"and": andFilters["and"] + [extraCondition]} @@ -707,6 +667,52 @@ def get_urns_by_filter( for andFilters in orFilters ] + return orFilters + + def get_urns_by_filter( + self, + *, + entity_types: Optional[List[str]] = None, + platform: Optional[str] = None, + platform_instance: Optional[str] = None, + env: Optional[str] = None, + query: Optional[str] = None, + container: Optional[str] = None, + status: RemovedStatusFilter = RemovedStatusFilter.NOT_SOFT_DELETED, + batch_size: int = 10000, + extraFilters: Optional[List[SearchFilterRule]] = None, + ) -> Iterable[str]: + """Fetch all urns that match all of the given filters. + + Filters are combined conjunctively. If multiple filters are specified, the results will match all of them. + Note that specifying a platform filter will automatically exclude all entity types that do not have a platform. + The same goes for the env filter. + + :param entity_types: List of entity types to include. If None, all entity types will be returned. + :param platform: Platform to filter on. If None, all platforms will be returned. + :param platform_instance: Platform instance to filter on. If None, all platform instances will be returned. + :param env: Environment (e.g. PROD, DEV) to filter on. If None, all environments will be returned. + :param query: Query string to filter on. If None, all entities will be returned. + :param container: A container urn that entities must be within. + This works recursively, so it will include entities within sub-containers as well. + If None, all entities will be returned. + Note that this requires browsePathV2 aspects (added in 0.10.4+). + :param status: Filter on the deletion status of the entity. The default is only return non-soft-deleted entities. + :param extraFilters: Additional filters to apply. If specified, the results will match all of the filters. 
+ + :return: An iterable of urns that match the filters. + """ + + types = self._get_types(entity_types) + + # Add the query default of * if no query is specified. + query = query or "*" + + # Env filter. + orFilters = self.generate_filter( + platform, platform_instance, env, container, status, extraFilters + ) + graphql_query = textwrap.dedent( """ query scrollUrnsWithFilters( @@ -738,18 +744,26 @@ def get_urns_by_filter( """ ) + variables = { + "types": types, + "query": query, + "orFilters": orFilters, + "batchSize": batch_size, + } + + for entity in self._scroll_across_entities(graphql_query, variables): + yield entity["urn"] + + def _scroll_across_entities( + self, graphql_query: str, variables_orig: dict + ) -> Iterable[dict]: + variables = variables_orig.copy() first_iter = True scroll_id: Optional[str] = None while first_iter or scroll_id: first_iter = False + variables["scrollId"] = scroll_id - variables = { - "types": types, - "query": query, - "orFilters": orFilters, - "batchSize": batch_size, - "scrollId": scroll_id, - } response = self.execute_graphql( graphql_query, variables=variables, @@ -757,13 +771,116 @@ def get_urns_by_filter( data = response["scrollAcrossEntities"] scroll_id = data["nextScrollId"] for entry in data["searchResults"]: - yield entry["entity"]["urn"] + yield entry["entity"] if scroll_id: logger.debug( f"Scrolling to next scrollAcrossEntities page: {scroll_id}" ) + def _get_env_or_conditions(self, env: str) -> List[SearchFilterRule]: + # The env filter is a bit more tricky since it's not always stored + # in the same place in ElasticSearch. + return [ + # For most entity types, we look at the origin field. + { + "field": "origin", + "value": env, + "condition": "EQUAL", + }, + # For containers, we look at the customProperties field. + # For any containers created after https://github.com/datahub-project/datahub/pull/8027, + # we look for the "env" property. Otherwise, we use the "instance" property. + { + "field": "customProperties", + "value": f"env={env}", + }, + { + "field": "customProperties", + "value": f"instance={env}", + }, + # Note that not all entity types have an env (e.g. dashboards / charts). + # If the env filter is specified, these will be excluded. + ] + + def _get_status_filer( + self, status: RemovedStatusFilter + ) -> Optional[SearchFilterRule]: + if status == RemovedStatusFilter.NOT_SOFT_DELETED: + # Subtle: in some cases (e.g. when the dataset doesn't have a status aspect), the + # removed field is simply not present in the ElasticSearch document. Ideally this + # would be a "removed" : "false" filter, but that doesn't work. Instead, we need to + # use a negated filter. + return { + "field": "removed", + "values": ["true"], + "condition": "EQUAL", + "negated": True, + } + + elif status == RemovedStatusFilter.ONLY_SOFT_DELETED: + return { + "field": "removed", + "values": ["true"], + "condition": "EQUAL", + } + + elif status == RemovedStatusFilter.ALL: + # We don't need to add a filter for this case. + return None + else: + raise ValueError(f"Invalid status filter: {status}") + + def _get_container_filter(self, container: str) -> SearchFilterRule: + # Warn if container is not a fully qualified urn. + # TODO: Change this once we have a first-class container urn type. 
+ if guess_entity_type(container) != "container": + raise ValueError(f"Invalid container urn: {container}") + + return { + "field": "browsePathV2", + "values": [container], + "condition": "CONTAIN", + } + + def _get_platform_instance_filter( + self, platform: Optional[str], platform_instance: str + ) -> SearchFilterRule: + if platform: + # Massage the platform instance into a fully qualified urn, if necessary. + platform_instance = make_dataplatform_instance_urn( + platform, platform_instance + ) + + # Warn if platform_instance is not a fully qualified urn. + # TODO: Change this once we have a first-class data platform instance urn type. + if guess_entity_type(platform_instance) != "dataPlatformInstance": + raise ValueError(f"Invalid data platform instance urn: {platform_instance}") + + return { + "field": "platformInstance", + "values": [platform_instance], + "condition": "EQUAL", + } + + def _get_platform_filter(self, platform: str) -> SearchFilterRule: + return { + "field": "platform.keyword", + "values": [make_data_platform_urn(platform)], + "condition": "EQUAL", + } + + def _get_types(self, entity_types: Optional[List[str]]) -> Optional[List[str]]: + types: Optional[List[str]] = None + if entity_types is not None: + if not entity_types: + raise ValueError( + "entity_types cannot be an empty list; use None for all entities" + ) + + types = [_graphql_entity_type(entity_type) for entity_type in entity_types] + return types + def get_latest_pipeline_checkpoint( self, pipeline_name: str, platform: str ) -> Optional[Checkpoint["GenericCheckpointState"]]: @@ -1033,43 +1150,36 @@ def initialize_schema_resolver_from_datahub( self, platform: str, platform_instance: Optional[str], env: str ) -> Tuple["SchemaResolver", Set[str]]: logger.info("Initializing schema resolver") - - # TODO: Filter on platform instance? 
- logger.info(f"Fetching urns for platform {platform}, env {env}") - with PerfTimer() as timer: - urns = set( - self.get_urns_by_filter( - entity_types=[DatasetUrn.ENTITY_TYPE], - platform=platform, - env=env, - batch_size=3000, - ) - ) - logger.info( - f"Fetched {len(urns)} urns in {timer.elapsed_seconds()} seconds" - ) - schema_resolver = self._make_schema_resolver( platform, platform_instance, env, include_graph=False ) + + logger.info(f"Fetching schemas for platform {platform}, env {env}") + urns = [] + count = 0 with PerfTimer() as timer: - count = 0 - for i, urn in enumerate(urns): - if i % 1000 == 0: - logger.debug(f"Loaded {i} schema metadata") + for urn, schema_info in self._bulk_fetch_schema_info_by_filter( + platform=platform, + platform_instance=platform_instance, + env=env, + ): try: - schema_metadata = self.get_aspect(urn, SchemaMetadataClass) - if schema_metadata: - schema_resolver.add_schema_metadata(urn, schema_metadata) - count += 1 + urns.append(urn) + schema_resolver.add_graphql_schema_metadata(urn, schema_info) + count += 1 except Exception: - logger.warning("Failed to load schema metadata", exc_info=True) + logger.warning("Failed to add schema info", exc_info=True) + + if count % 1000 == 0: + logger.debug( + f"Loaded {count} schema info in {timer.elapsed_seconds()} seconds" + ) logger.info( - f"Loaded {count} schema metadata in {timer.elapsed_seconds()} seconds" + f"Finished loading total {count} schema info in {timer.elapsed_seconds()} seconds" ) logger.info("Finished initializing schema resolver") - return schema_resolver, urns + return schema_resolver, set(urns) def parse_sql_lineage( self, diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py index ae49a4ba17c11..8a16b1a4a5f6b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py @@ -285,9 +285,7 @@ def __init__(self, ctx: PipelineContext, config: BigQueryV2Config): # Maps view ref -> actual sql self.view_definitions: FileBackedDict[str] = FileBackedDict() - self.sql_parser_schema_resolver = SchemaResolver( - platform=self.platform, env=self.config.env - ) + self.sql_parser_schema_resolver = self._init_schema_resolver() self.add_config_to_report() atexit.register(cleanup, config) @@ -446,6 +444,27 @@ def test_connection(config_dict: dict) -> TestConnectionReport: ) return test_report + def _init_schema_resolver(self) -> SchemaResolver: + schema_resolution_required = ( + self.config.lineage_parse_view_ddl or self.config.lineage_use_sql_parser + ) + schema_ingestion_enabled = ( + self.config.include_views and self.config.include_tables + ) + + if schema_resolution_required and not schema_ingestion_enabled: + if self.ctx.graph: + return self.ctx.graph.initialize_schema_resolver_from_datahub( + platform=self.platform, + platform_instance=self.config.platform_instance, + env=self.config.env, + )[0] + else: + logger.warning( + "Failed to load schema info from DataHub as DataHubGraph is missing.", + ) + return SchemaResolver(platform=self.platform, env=self.config.env) + def get_dataplatform_instance_aspect( self, dataset_urn: str, project_id: str ) -> MetadataWorkUnit: diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py index d677b0874b985..f18235af3d1fd 100644 --- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py 
+++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py @@ -14,6 +14,7 @@ import sqlglot.optimizer.qualify import sqlglot.optimizer.qualify_columns from pydantic import BaseModel +from typing_extensions import TypedDict from datahub.emitter.mce_builder import ( DEFAULT_ENV, @@ -36,6 +37,15 @@ SQL_PARSE_RESULT_CACHE_SIZE = 1000 +class GraphQLSchemaField(TypedDict): + fieldPath: str + nativeDataType: str + + +class GraphQLSchemaMetadata(TypedDict): + fields: List[GraphQLSchemaField] + + class QueryType(enum.Enum): CREATE = "CREATE" SELECT = "SELECT" @@ -330,6 +340,12 @@ def add_schema_metadata( def add_raw_schema_info(self, urn: str, schema_info: SchemaInfo) -> None: self._save_to_cache(urn, schema_info) + def add_graphql_schema_metadata( + self, urn: str, schema_metadata: GraphQLSchemaMetadata + ) -> None: + schema_info = self.convert_graphql_schema_metadata_to_info(schema_metadata) + self._save_to_cache(urn, schema_info) + def _save_to_cache(self, urn: str, schema_info: Optional[SchemaInfo]) -> None: self._schema_cache[urn] = schema_info @@ -356,6 +372,24 @@ def _convert_schema_aspect_to_info( not in DatasetUrn.get_simple_field_path_from_v2_field_path(col.fieldPath) } + @classmethod + def convert_graphql_schema_metadata_to_info( + cls, schema: GraphQLSchemaMetadata + ) -> SchemaInfo: + return { + DatasetUrn.get_simple_field_path_from_v2_field_path(field["fieldPath"]): ( + # The actual types are more of a "nice to have". + field["nativeDataType"] + or "str" + ) + for field in schema["fields"] + # TODO: We can't generate lineage to columns nested within structs yet. + if "." + not in DatasetUrn.get_simple_field_path_from_v2_field_path( + field["fieldPath"] + ) + } + # TODO add a method to load all from graphql def close(self) -> None: From 4be8fd0905b6631ddf7161ab412719bed786882a Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Thu, 21 Sep 2023 15:59:56 -0700 Subject: [PATCH 3/3] fix(docs): remove link-checker from CI (#8883) --- docs-website/markdown-link-check-config.json | 37 ++++++++------------ docs-website/package.json | 6 ++-- 2 files changed, 18 insertions(+), 25 deletions(-) diff --git a/docs-website/markdown-link-check-config.json b/docs-website/markdown-link-check-config.json index 26e040edde6f7..2f5a51ada324e 100644 --- a/docs-website/markdown-link-check-config.json +++ b/docs-website/markdown-link-check-config.json @@ -1,50 +1,41 @@ { "ignorePatterns": [ { - "pattern": "^http://demo\\.datahubproject\\.io" + "pattern": "^https?://demo\\.datahubproject\\.io" }, { - "pattern": "^http://localhost" + "pattern": "^http://localhost" }, { - "pattern": "^http://www.famfamfam.com" + "pattern": "^/docs" }, { - "pattern": "^http://www.linkedin.com" + "pattern": "^/integrations" }, { - "pattern": "\\.md$" + "pattern": "^https?://www.linkedin.com" }, { - "pattern":"\\.json$" + "pattern": "\\.md(#.*)?$" }, { - "pattern":"\\.txt$" + "pattern": "\\.json$" }, { - "pattern": "\\.java$" + "pattern": "\\.txt$" }, { - "pattern": "\\.md#.*$" + "pattern": "\\.java$" }, { - "pattern": "^https://oauth2.googleapis.com/token" + "pattern": "^https://oauth2.googleapis.com/token" }, { - "pattern": "^https://login.microsoftonline.com/common/oauth2/na$" + "pattern": "^https://login.microsoftonline.com/common/oauth2/na$" }, { - "pattern": "#v(\\d+)-(\\d+)-(\\d+)" - }, - { - "pattern": "^https://github.com/mohdsiddique$" - }, - { - "pattern": "^https://github.com/2x$" - }, - { - "pattern": "^https://github.com/datahub-project/datahub/assets/15873986/2f47d033-6c2b-483a-951d-e6d6b807f0d0%22%3E$" + 
"pattern": "^https://github.com/datahub-project/datahub/assets/15873986/2f47d033-6c2b-483a-951d-e6d6b807f0d0%22%3E$" } ], - "aliveStatusCodes": [200, 206, 0, 999, 400, 401, 403] -} \ No newline at end of file + "aliveStatusCodes": [200, 206, 0, 999] +} diff --git a/docs-website/package.json b/docs-website/package.json index 1722f92169692..eca6e5814d3c6 100644 --- a/docs-website/package.json +++ b/docs-website/package.json @@ -17,8 +17,10 @@ "generate": "rm -rf genDocs genStatic && mkdir genDocs genStatic && yarn _generate-docs && mv docs/* genDocs/ && rmdir docs", "generate-rsync": "mkdir -p genDocs genStatic && yarn _generate-docs && rsync -v --checksum -r -h -i --delete docs/ genDocs && rm -rf docs", "lint": "prettier -w generateDocsDir.ts sidebars.js src/pages/index.js", - "lint-check": "prettier -l generateDocsDir.ts sidebars.js src/pages/index.js && find ./genDocs -name \\*.md -not -path \"./genDocs/python-sdk/models.md\" -print0 | xargs -0 -n1 markdown-link-check -p -q -c markdown-link-check-config.json", - "lint-fix": "prettier --write generateDocsDir.ts sidebars.js src/pages/index.js" + "lint-check": "prettier -l generateDocsDir.ts sidebars.js src/pages/index.js", + "lint-fix": "prettier --write generateDocsDir.ts sidebars.js src/pages/index.js", + "_list-link-check-files": "find ./genDocs -name '*.md' -not \\( -path './genDocs/python-sdk/*' -o -path './genDocs/releases.md' \\)", + "check-links": "yarn run -s _list-link-check-files -print0 | xargs -0 -n1 -t markdown-link-check -q -c markdown-link-check-config.json" }, "dependencies": { "@ant-design/icons": "^4.7.0",