diff --git a/docs-website/markdown-link-check-config.json b/docs-website/markdown-link-check-config.json
index 5b98f28501d49..459f03ed98e21 100644
--- a/docs-website/markdown-link-check-config.json
+++ b/docs-website/markdown-link-check-config.json
@@ -19,6 +19,9 @@
       "pattern": "\\.pdl$"
     },
     {
+      "pattern": "\\.pdl#.*$"
+    },
+    {
       "pattern":"\\.json$"
     },
     {
@@ -31,7 +34,16 @@
       "pattern": "\\.svg$"
     },
     {
+      "pattern": "\\.java$"
+    },
+    {
       "pattern": "\\.md#.*$"
+    },
+    {
+      "pattern": "^https://oauth2.googleapis.com/token"
+    },
+    {
+      "pattern": "^https://login.microsoftonline.com/common/oauth2/na$"
     }
   ],
   "aliveStatusCodes": [200, 206, 0, 999, 400, 401, 403]
diff --git a/docs-website/package.json b/docs-website/package.json
index 2252eed07fb73..420510459d858 100644
--- a/docs-website/package.json
+++ b/docs-website/package.json
@@ -17,7 +17,7 @@
     "generate": "rm -rf genDocs genStatic && mkdir genDocs genStatic && yarn _generate-docs && mv docs/* genDocs/ && rmdir docs",
     "generate-rsync": "mkdir -p genDocs genStatic && yarn _generate-docs && rsync -v --checksum -r -h -i --delete docs/ genDocs && rm -rf docs",
     "lint": "prettier -w generateDocsDir.ts sidebars.js src/pages/index.js",
-    "lint-check": "prettier -l generateDocsDir.ts sidebars.js src/pages/index.js && find ../docs -name '*.md' -exec markdown-link-check -q {} -c markdown-link-check-config.json \\;",
+    "lint-check": "prettier -l generateDocsDir.ts sidebars.js src/pages/index.js && find ./genDocs -name '*.md' -exec markdown-link-check -q {} -c markdown-link-check-config.json \\;",
     "lint-fix": "prettier --write generateDocsDir.ts sidebars.js src/pages/index.js"
   },
   "dependencies": {
diff --git a/docs-website/yarn.lock b/docs-website/yarn.lock
index 209a57a43dab0..0613fe71ef78e 100644
--- a/docs-website/yarn.lock
+++ b/docs-website/yarn.lock
@@ -2986,6 +2986,13 @@
   dependencies:
     "@types/node" "*"
 
+"@types/websocket@^1.0.3":
+  version "1.0.6"
+  resolved "https://registry.yarnpkg.com/@types/websocket/-/websocket-1.0.6.tgz#ec8dce5915741632ac3a4b1f951b6d4156e32d03"
+  integrity sha512-JXkliwz93B2cMWOI1ukElQBPN88vMg3CruvW4KVSKpflt3NyNCJImnhIuB/f97rG7kakqRJGFiwkA895Kn02Dg==
+  dependencies:
+    "@types/node" "*"
+
 "@types/ws@^8.5.5":
   version "8.5.5"
   resolved "https://registry.yarnpkg.com/@types/ws/-/ws-8.5.5.tgz#af587964aa06682702ee6dcbc7be41a80e4b28eb"
@@ -7053,7 +7060,6 @@ node-forge@^1:
   resolved "https://registry.yarnpkg.com/node-forge/-/node-forge-1.3.1.tgz#be8da2af243b2417d5f646a770663a92b7e9ded3"
   integrity sha512-dPEtOeMvF9VMcYV/1Wb8CPoVAXtp6MKMlcbAt4ddqmGqUJ6fQZFXkNZNkNlfevtNkGtaSoXf/vNNNSvgrdXwtA==
-
 node-gyp-build@^4.3.0:
   version "4.6.1"
   resolved "https://registry.yarnpkg.com/node-gyp-build/-/node-gyp-build-4.6.1.tgz#24b6d075e5e391b8d5539d98c7fc5c210cac8a3e"
@@ -9903,6 +9909,10 @@ use-sidecar@^1.1.2:
     detect-node-es "^1.1.0"
     tslib "^2.0.0"
 
+use-sync-external-store@^1.2.0:
+  version "1.2.0"
+  resolved "https://registry.yarnpkg.com/use-sync-external-store/-/use-sync-external-store-1.2.0.tgz#7dbefd6ef3fe4e767a0cf5d7287aacfb5846928a"
+  integrity sha512-eEgnFxGQ1Ife9bzYs6VLi8/4X6CObHMw9Qr9tPY43iKwsPw8xE8+EFsf/2cFZ5S3esXgpWgtSCtLNS41F+sKPA==
+
 utf-8-validate@^5.0.2:
   version "5.0.10"
@@ -9911,12 +9921,6 @@
   dependencies:
     node-gyp-build "^4.3.0"
 
-use-sync-external-store@^1.2.0:
-  version "1.2.0"
-  resolved "https://registry.yarnpkg.com/use-sync-external-store/-/use-sync-external-store-1.2.0.tgz#7dbefd6ef3fe4e767a0cf5d7287aacfb5846928a"
-  integrity sha512-eEgnFxGQ1Ife9bzYs6VLi8/4X6CObHMw9Qr9tPY43iKwsPw8xE8+EFsf/2cFZ5S3esXgpWgtSCtLNS41F+sKPA==
-
 util-deprecate@^1.0.1, util-deprecate@^1.0.2, util-deprecate@~1.0.1:
   version "1.0.2"
   resolved "https://registry.yarnpkg.com/util-deprecate/-/util-deprecate-1.0.2.tgz#450d4dc9fa70de732762fbd2d4a28981419a0ccf"
diff --git a/docs/architecture/architecture.md b/docs/architecture/architecture.md
index 6a9c1860d71b0..20f18f09d949b 100644
--- a/docs/architecture/architecture.md
+++ b/docs/architecture/architecture.md
@@ -17,7 +17,7 @@ The figures below describe the high-level architecture of DataHub.
 
-<img ... />
+<img ... />
 
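A note on the `markdown-link-check-config.json` hunks above: each `pattern` entry is a regular expression, and the checker skips any link that matches one of them. A minimal Python sketch of how the newly added patterns behave (approximating the checker's matching; the sample URLs are illustrative, not taken from this diff):

```python
import re

# Patterns added in the hunk above; markdown-link-check skips any link
# that matches one of its configured ignorePatterns.
new_patterns = [
    r"\.pdl#.*$",                                             # .pdl links with anchors
    r"\.java$",                                               # Java source links
    r"^https://oauth2.googleapis.com/token",                  # Google token endpoint
    r"^https://login.microsoftonline.com/common/oauth2/na$",  # Azure AD endpoint
]

# Illustrative links only.
samples = [
    "https://github.com/datahub-project/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl#L10",
    "https://oauth2.googleapis.com/token",
    "https://datahubproject.io/docs/",
]

for link in samples:
    skipped = any(re.search(p, link) for p in new_patterns)
    print(f"{link} -> {'skipped' if skipped else 'checked'}")
```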
diff --git a/docs/authentication/guides/add-users.md b/docs/authentication/guides/add-users.md
index f5dfc83201083..d380cacd6665e 100644
--- a/docs/authentication/guides/add-users.md
+++ b/docs/authentication/guides/add-users.md
@@ -19,13 +19,13 @@ To do so, navigate to the **Users & Groups** section inside of Settings page. He
 do not have the correct privileges to invite users, this button will be disabled.
 
-<img ... />
+<img ... />
 
 To invite new users, simply share the link with others inside your organization.
 
-<img ... />
+<img ... />
 
 When a new user visits the link, they will be directed to a sign up screen where they can create their DataHub account.
@@ -37,13 +37,13 @@ and click **Reset user password** inside the menu dropdown on the right hand sid
 `Manage User Credentials` [Platform Privilege](../../authorization/access-policies-guide.md) in order to reset passwords.
 
-<img ... />
+<img ... />
 
 To reset the password, simply share the password reset link with the user who needs to change their password. Password reset links expire after 24 hours.
 
-<img ... />
+<img ... />
 
 # Configuring Single Sign-On with OpenID Connect
diff --git a/docs/authentication/guides/sso/configure-oidc-react.md b/docs/authentication/guides/sso/configure-oidc-react.md
index d27792ce3967b..512d6adbf916f 100644
--- a/docs/authentication/guides/sso/configure-oidc-react.md
+++ b/docs/authentication/guides/sso/configure-oidc-react.md
@@ -26,7 +26,7 @@ please see [this guide](../jaas.md) to mount a custom user.props file for a JAAS
 To configure OIDC in React, you will most often need to register yourself as a client with your identity provider (Google, Okta, etc). Each provider may have their own instructions. Provided below are links to examples for Okta, Google, Azure AD, & Keycloak.
 
-- [Registering an App in Okta](https://developer.okta.com/docs/guides/add-an-external-idp/apple/register-app-in-okta/)
+- [Registering an App in Okta](https://developer.okta.com/docs/guides/add-an-external-idp/openidconnect/main/)
 - [OpenID Connect in Google Identity](https://developers.google.com/identity/protocols/oauth2/openid-connect)
 - [OpenID Connect authentication with Azure Active Directory](https://docs.microsoft.com/en-us/azure/active-directory/fundamentals/auth-oidc)
 - [Keycloak - Securing Applications and Services Guide](https://www.keycloak.org/docs/latest/securing_apps/)
diff --git a/docs/domains.md b/docs/domains.md
index c846a753417c5..1b2ebc9d47f39 100644
--- a/docs/domains.md
+++ b/docs/domains.md
@@ -22,20 +22,20 @@ You can create this privileges by creating a new [Metadata Policy](./authorizati
 To create a Domain, first navigate to the **Domains** tab in the top-right menu of DataHub.
 
-<img ... />
+<img ... />
 
 Once you're on the Domains page, you'll see a list of all the Domains that have been created on DataHub. Additionally, you can view the number of entities inside each Domain.
 
-<img ... />
+<img ... />
 
 To create a new Domain, click '+ New Domain'.
 
-<img ... />
+<img ... />
 
 Inside the form, you can choose a name for your Domain. Most often, this will align with your business units or groups, for example
@@ -48,7 +48,7 @@ for the Domain. This option is useful if you intend to refer to Domains by a com
 key to be human-readable. Proceed with caution: once you select a custom id, it cannot be easily changed.
 
-<img ... />
+<img ... />
 
 By default, you don't need to worry about this. DataHub will auto-generate a unique Domain id for you.
@@ -64,7 +64,7 @@ To assign an asset to a Domain, simply navigate to the asset's profile page. At
 see a 'Domain' section. Click 'Set Domain', and then search for the Domain you'd like to add to. When you're done, click 'Add'.
 
-<img ... />
+<img ... />
 
 To remove an asset from a Domain, click the 'x' icon on the Domain tag.
@@ -149,27 +149,27 @@ source:
 Once you've created a Domain, you can use the search bar to find it.
 
-<img ... />
+<img ... />
 
 Clicking on the search result will take you to the Domain's profile, where you can edit its description, add / remove owners, and view the assets inside the Domain.
 
-<img ... />
+<img ... />
 
 Once you've added assets to a Domain, you can filter search results to limit to those Assets within a particular Domain using the left-side search filters.
 
-<img ... />
+<img ... />
 
 On the homepage, you'll also find a list of the most popular Domains in your organization.
 
-<img ... />
+<img ... />
 
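The Domains walkthrough above is UI-driven; for completeness, a hypothetical sketch of creating a Domain programmatically with the DataHub Python emitter. The domain id, name, description, and GMS address are illustrative assumptions, not values from this diff:

```python
from datahub.emitter.mce_builder import make_domain_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import DomainPropertiesClass

# Illustrative values only.
domain_urn = make_domain_urn("marketing")
properties = DomainPropertiesClass(
    name="Marketing",
    description="Entities related to the marketing department.",
)

# Same emitter pattern as the snippet shown in metadata-model.md below.
emitter = DatahubRestEmitter("http://localhost:8080")
emitter.emit(MetadataChangeProposalWrapper(entityUrn=domain_urn, aspect=properties))
```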
 ## Additional Resources
@@ -242,7 +242,6 @@ DataHub supports Tags, Glossary Terms, & Domains as distinct types of Metadata t
 - **Tags**: Informal, loosely controlled labels that serve as a tool for search & discovery. Assets may have multiple tags. No formal, central management.
 - **Glossary Terms**: A controlled vocabulary, with optional hierarchy. Terms are typically used to standardize types of leaf-level attributes (i.e. schema fields) for governance. E.g. (EMAIL_PLAINTEXT)
 - **Domains**: A set of top-level categories. Usually aligned to business units / disciplines to which the assets are most relevant. Central or distributed management. Single Domain assignment per data asset.
-
 *Need more help? Join the conversation in [Slack](http://slack.datahubproject.io)!*
 
 ### Related Features
diff --git a/docs/how/add-new-aspect.md b/docs/how/add-new-aspect.md
index 6ea7256ed75cc..6453812ea327a 100644
--- a/docs/how/add-new-aspect.md
+++ b/docs/how/add-new-aspect.md
@@ -1,7 +1,7 @@
 # How to add a new metadata aspect?
 
 Adding a new metadata [aspect](../what/aspect.md) is one of the most common ways to extend an existing [entity](../what/entity.md).
-We'll use the [CorpUserEditableInfo](https://github.com/datahub-project/datahub/blob/master/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl) as an example here.
+We'll use the CorpUserEditableInfo as an example here.
 
 1. Add the aspect model to the corresponding namespace (e.g. [`com.linkedin.identity`](https://github.com/datahub-project/datahub/tree/master/metadata-models/src/main/pegasus/com/linkedin/identity))
 
@@ -14,7 +14,7 @@ We'll use the [CorpUserEditableInfo](https://github.com/datahub-project/datahub/
 4. To surface the new aspect at the top-level [resource endpoint](https://linkedin.github.io/rest.li/user_guide/restli_server#writing-resources), extend the resource data model (e.g. [`CorpUser`](https://github.com/datahub-project/datahub/blob/master/gms/api/src/main/pegasus/com/linkedin/identity/CorpUser.pdl)) with an optional field (e.g. [`editableInfo`](https://github.com/datahub-project/datahub/blob/master/gms/api/src/main/pegasus/com/linkedin/identity/CorpUser.pdl#L21)). You'll also need to extend the `toValue` & `toSnapshot` methods of the top-level resource (e.g. [`CorpUsers`](https://github.com/datahub-project/datahub/blob/master/gms/impl/src/main/java/com/linkedin/metadata/resources/identity/CorpUsers.java)) to convert between the snapshot & value models.
 
-5. (Optional) If there's need to update the aspect via API (instead of/in addition to MCE), add a [sub-resource](https://linkedin.github.io/rest.li/user_guide/restli_server#sub-resources) endpoint for the new aspect (e.g. [`CorpUsersEditableInfoResource`](https://github.com/datahub-project/datahub/blob/master/gms/impl/src/main/java/com/linkedin/metadata/resources/identity/CorpUsersEditableInfoResource.java)). The sub-resource endpiont also allows you to retrieve previous versions of the aspect as well as additional metadata such as the audit stamp.
+5. (Optional) If there's need to update the aspect via API (instead of/in addition to MCE), add a [sub-resource](https://linkedin.github.io/rest.li/user_guide/restli_server#sub-resources) endpoint for the new aspect (e.g. `CorpUsersEditableInfoResource`). The sub-resource endpoint also allows you to retrieve previous versions of the aspect as well as additional metadata such as the audit stamp.
 
-6. After rebuilding & restarting [gms](https://github.com/datahub-project/datahub/tree/master/gms), [mce-consumer-job](https://github.com/datahub-project/datahub/tree/master/metadata-jobs/mce-consumer-job) & [mae-consumer-job](https://github.com/datahub-project/datahub/tree/master/metadata-jobs/mae-consumer-job),
+6. After rebuilding & restarting gms, [mce-consumer-job](https://github.com/datahub-project/datahub/tree/master/metadata-jobs/mce-consumer-job) & [mae-consumer-job](https://github.com/datahub-project/datahub/tree/master/metadata-jobs/mae-consumer-job),
 you should be able to start emitting [MCE](../what/mxe.md) with the new aspect and have it automatically ingested & stored in DB.
diff --git a/docs/modeling/extending-the-metadata-model.md b/docs/modeling/extending-the-metadata-model.md
index a232b034e639b..be2d7d795de70 100644
--- a/docs/modeling/extending-the-metadata-model.md
+++ b/docs/modeling/extending-the-metadata-model.md
@@ -82,14 +82,14 @@ Because they are aspects, keys need to be annotated with an @Aspect annotation,
 can be a part of. The key can also be annotated with the two index annotations: @Relationship and @Searchable. This instructs DataHub
-infra to use the fields in the key to create relationships and index fields for search. See [Step 3](#step_3) for more details on
+infra to use the fields in the key to create relationships and index fields for search. See [Step 3](#step-3-define-custom-aspects-or-attach-existing-aspects-to-your-entity) for more details on
 the annotation model.
 
 **Constraints**: Note that each field in a Key Aspect MUST be of String or Enum type.
 
 ### Step 2: Create the new entity with its key aspect
 
-Define the entity within an `entity-registry.yml` file. Depending on your approach, the location of this file may vary. More on that in steps [4](#step-4-choose-a-place-to-store-your-model-extension) and [5](#step_5).
+Define the entity within an `entity-registry.yml` file. Depending on your approach, the location of this file may vary. More on that in steps [4](#step-4-choose-a-place-to-store-your-model-extension) and [5](#step-5-attaching-your-non-key-aspects-to-the-entity).
 
 Example:
 ```yaml
diff --git a/docs/modeling/metadata-model.md b/docs/modeling/metadata-model.md
index e6cc13df6233d..a8958985a0a72 100644
--- a/docs/modeling/metadata-model.md
+++ b/docs/modeling/metadata-model.md
@@ -433,7 +433,7 @@ aggregation query against a timeseries aspect.
 The *@TimeseriesField* and the *@TimeseriesFieldCollection* are two new annotations that can be attached to a field of
 a *Timeseries aspect* that allows it to be part of an aggregatable query. The kinds of aggregations allowed on these
 annotated fields depends on the type of the field, as well as the kind of aggregation, as
-described [here](#Performing-an-aggregation-on-a-Timeseries-aspect.).
+described [here](#performing-an-aggregation-on-a-timeseries-aspect).
 
 * `@TimeseriesField = {}` - this annotation can be used with any type of non-collection type field of the aspect
   such as primitive types and records (see the fields *stat*, *strStat* and *strArray* fields
@@ -515,7 +515,7 @@ my_emitter = DatahubRestEmitter("http://localhost:8080")
 my_emitter.emit(mcpw)
 ```
 
-###### Performing an aggregation on a Timeseries aspect.
+###### Performing an aggregation on a Timeseries aspect
 
 Aggreations on timeseries aspects can be performed by the GMS REST API for `/analytics?action=getTimeseriesStats` which
 accepts the following params.
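The metadata-model.md hunk above ends just before the parameter list for `/analytics?action=getTimeseriesStats`. As a hedged sketch of calling that endpoint with Python's `requests`: the payload fields below (`entityName`, `aspectName`, `metrics`, `buckets`) are assumptions for illustration and should be checked against the full aggregation doc:

```python
import requests

# Hedged example payload -- field names and values are illustrative assumptions.
payload = {
    "entityName": "dataset",
    "aspectName": "datasetProfile",
    "metrics": [{"fieldPath": "rowCount", "aggregationType": "LATEST"}],
    "buckets": [
        {
            "key": "timestampMillis",
            "type": "DATE_GROUPING_BUCKET",
            "timeUnit": "DAY",
            "duration": 1,
        }
    ],
}

# GMS address matches the emitter snippet above; adjust for your deployment.
response = requests.post(
    "http://localhost:8080/analytics?action=getTimeseriesStats",
    json=payload,
)
response.raise_for_status()
print(response.json())
```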
diff --git a/docs/tags.md b/docs/tags.md
index 945b514dc7b47..cb08c9fafea49 100644
--- a/docs/tags.md
+++ b/docs/tags.md
@@ -27,25 +27,25 @@ You can create these privileges by creating a new [Metadata Policy](./authorizat
 To add a tag at the dataset or container level, simply navigate to the page for that entity and click on the **Add Tag** button.
 
-<img ... />
+<img ... />
 
 Type in the name of the tag you want to add. You can add a new tag, or add a tag that already exists (the autocomplete will pull up the tag if it already exists).
 
-<img ... />
+<img ... />
 
 Click on the "Add" button and you'll see the tag has been added!
 
-<img ... />
+<img ... />
 
 If you would like to add a tag at the schema level, hover over the "Tags" column for a schema until the "Add Tag" button shows up, and then follow the same flow as above.
 
-<img ... />
+<img ... />
 
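The tagging flow above is likewise UI-driven; a hypothetical equivalent with the Python emitter, attaching a tag to a dataset (the dataset and tag names are illustrative only):

```python
from datahub.emitter.mce_builder import make_dataset_urn, make_tag_urn
from datahub.emitter.mcp import MetadataChangeProposalWrapper
from datahub.emitter.rest_emitter import DatahubRestEmitter
from datahub.metadata.schema_classes import GlobalTagsClass, TagAssociationClass

# Illustrative identifiers only.
dataset_urn = make_dataset_urn(platform="hive", name="fct_users_created", env="PROD")

# Note: emitting GlobalTags replaces the aspect, so include any tags
# you want to keep, not just the new one.
tags = GlobalTagsClass(tags=[TagAssociationClass(tag=make_tag_urn("Legacy"))])

emitter = DatahubRestEmitter("http://localhost:8080")
emitter.emit(MetadataChangeProposalWrapper(entityUrn=dataset_urn, aspect=tags))
```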
 ### Removing a Tag
@@ -57,7 +57,7 @@ To remove a tag, simply click on the "X" button in the tag. Then click "Yes" whe
 You can search for a tag in the search bar, and even filter entities by the presence of a specific tag.
 
-<img ... />
+<img ... />
 
 ## Additional Resources
diff --git a/docs/what/gms.md b/docs/what/gms.md
index 9e1cea1b9540e..d7d3499643f46 100644
--- a/docs/what/gms.md
+++ b/docs/what/gms.md
@@ -2,6 +2,4 @@
 Metadata for [entities](entity.md) [onboarded](../modeling/metadata-model.md) to [GMA](gma.md) is served through microservices known as Generalized Metadata Service (GMS). GMS typically provides a [Rest.li](http://rest.li) API and must access the metadata using [GMA DAOs](../architecture/metadata-serving.md).
 
-While a GMS is completely free to define its public APIs, we do provide a list of [resource base classes](https://github.com/datahub-project/datahub-gma/tree/master/restli-resources/src/main/java/com/linkedin/metadata/restli) to leverage for common patterns.
-
 GMA is designed to support a distributed fleet of GMS, each serving a subset of the [GMA graph](graph.md). However, for simplicity we include a single centralized GMS ([datahub-gms](../../gms)) that serves all entities.
diff --git a/metadata-ingestion/docs/sources/gcs/README.md b/metadata-ingestion/docs/sources/gcs/README.md
index 2d021950e83de..d6bb8147f076d 100644
--- a/metadata-ingestion/docs/sources/gcs/README.md
+++ b/metadata-ingestion/docs/sources/gcs/README.md
@@ -9,8 +9,8 @@ and uses DataHub S3 Data Lake integration source under the hood. Refer section [
 This ingestion source maps the following Source System Concepts to DataHub Concepts:
 
 | Source Concept                             | DataHub Concept                                                                             | Notes                |
-| ------------------------------------------ | ------------------------------------------------------------------------------------------ | -------------------- |
-| `"Google Cloud Storage"`                   | [Data Platform](https://datahubproject.io/docs/generated/metamodel/entities/dataPlatform/)  |                      |
+| ------------------------------------------ |--------------------------------------------------------------------------------------------| -------------------- |
+| `"Google Cloud Storage"`                   | [Data Platform](https://datahubproject.io/docs/generated/metamodel/entities/dataplatform/)  |                      |
 | GCS object / Folder containing GCS objects | [Dataset](https://datahubproject.io/docs/generated/metamodel/entities/dataset/)             |                      |
 | GCS bucket                                 | [Container](https://datahubproject.io/docs/generated/metamodel/entities/container/)         | Subtype `GCS bucket` |
 | GCS folder                                 | [Container](https://datahubproject.io/docs/generated/metamodel/entities/container/)         | Subtype `Folder`     |
diff --git a/metadata-ingestion/docs/sources/kafka-connect/README.md b/metadata-ingestion/docs/sources/kafka-connect/README.md
index ac3728b6eacba..5031bff5a3fac 100644
--- a/metadata-ingestion/docs/sources/kafka-connect/README.md
+++ b/metadata-ingestion/docs/sources/kafka-connect/README.md
@@ -10,11 +10,11 @@ This plugin extracts the following:
 This ingestion source maps the following Source System Concepts to DataHub Concepts:
 
-| Source Concept | DataHub Concept | Notes |
-| --------------------------- | ------------------------------------------------------------- | --------------------------------------------------------------------------- |
-| `"kafka-connect"` | [Data Platform](https://datahubproject.io/docs/generated/metamodel/entities/dataPlatform/) | |
-| [Connector](https://kafka.apache.org/documentation/#connect_connectorsandtasks) | [DataFlow](https://datahubproject.io/docs/generated/metamodel/entities/dataflow/) | |
-| Kafka Topic | [Dataset](https://datahubproject.io/docs/generated/metamodel/entities/dataset/) | |
+| Source Concept | DataHub Concept | Notes |
+| --------------------------- |--------------------------------------------------------------------------------------------| --------------------------------------------------------------------------- |
+| `"kafka-connect"` | [Data Platform](https://datahubproject.io/docs/generated/metamodel/entities/dataplatform/) | |
+| [Connector](https://kafka.apache.org/documentation/#connect_connectorsandtasks) | [DataFlow](https://datahubproject.io/docs/generated/metamodel/entities/dataflow/) | |
+| Kafka Topic | [Dataset](https://datahubproject.io/docs/generated/metamodel/entities/dataset/) | |
 
 ## Current limitations
diff --git a/metadata-ingestion/docs/sources/s3/README.md b/metadata-ingestion/docs/sources/s3/README.md
index 17fed8a70abb4..8d65e1cf8b943 100644
--- a/metadata-ingestion/docs/sources/s3/README.md
+++ b/metadata-ingestion/docs/sources/s3/README.md
@@ -6,8 +6,8 @@ To specify the group of files that form a dataset, use `path_specs` configuratio
 This ingestion source maps the following Source System Concepts to DataHub Concepts:
 
 | Source Concept                            | DataHub Concept                                                                             | Notes               |
-| ---------------------------------------- | ------------------------------------------------------------------------------------------ | ------------------- |
-| `"s3"`                                    | [Data Platform](https://datahubproject.io/docs/generated/metamodel/entities/dataPlatform/)  |                     |
+| ---------------------------------------- |--------------------------------------------------------------------------------------------| ------------------- |
+| `"s3"`                                    | [Data Platform](https://datahubproject.io/docs/generated/metamodel/entities/dataplatform/)  |                     |
 | s3 object / Folder containing s3 objects  | [Dataset](https://datahubproject.io/docs/generated/metamodel/entities/dataset/)             |                     |
 | s3 bucket                                 | [Container](https://datahubproject.io/docs/generated/metamodel/entities/container/)         | Subtype `S3 bucket` |
 | s3 folder                                 | [Container](https://datahubproject.io/docs/generated/metamodel/entities/container/)         | Subtype `Folder`    |
diff --git a/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py b/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py
index 9394a8bba5e0b..6591ef8a3fc4f 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/usage/starburst_trino_usage.py
@@ -112,9 +112,6 @@ class TrinoUsageSource(Source):
     #### Prerequsities
     1. You need to setup Event Logger which saves audit logs into a Postgres db and setup this db as a catalog in Trino
-       Here you can find more info about how to setup:
-       https://docs.starburst.io/354-e/security/event-logger.html#security-event-logger--page-root
-       https://docs.starburst.io/354-e/security/event-logger.html#analyzing-the-event-log
     2. Install starbust-trino-usage plugin
        Run pip install 'acryl-datahub[starburst-trino-usage]'.
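To round out the Starburst Trino usage prerequisites above, a sketch of wiring the plugin into a programmatic ingestion run. The `config` keys below (host/port, credentials, audit catalog/schema) are assumptions for illustration and should be verified against the source's config reference:

```python
from datahub.ingestion.run.pipeline import Pipeline

# Config keys are illustrative assumptions -- consult the
# starburst-trino-usage source docs for the authoritative schema.
pipeline = Pipeline.create(
    {
        "source": {
            "type": "starburst-trino-usage",
            "config": {
                "host_port": "localhost:8080",
                "database": "dbname",
                "username": "foo",
                "password": "bar",
                "audit_catalog": "audit",
                "audit_schema": "audit_schema",
            },
        },
        "sink": {
            "type": "datahub-rest",
            "config": {"server": "http://localhost:8080"},
        },
    }
)
pipeline.run()
pipeline.raise_from_status()
```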