From a17ed80cf742b0e3f5a04632eac6dc2eec045da6 Mon Sep 17 00:00:00 2001 From: Joshua Eilers Date: Mon, 21 Aug 2023 09:47:05 -0700 Subject: [PATCH 01/11] Fix a few view select issues (#8670) --- .../src/app/entity/view/select/ViewSelect.tsx | 23 +++++++++++++------ .../entity/view/select/ViewSelectHeader.tsx | 2 +- .../src/app/search/SearchBar.tsx | 10 +++++++- 3 files changed, 26 insertions(+), 9 deletions(-) diff --git a/datahub-web-react/src/app/entity/view/select/ViewSelect.tsx b/datahub-web-react/src/app/entity/view/select/ViewSelect.tsx index 03689460eb02b..eda9b7d7fe2a4 100644 --- a/datahub-web-react/src/app/entity/view/select/ViewSelect.tsx +++ b/datahub-web-react/src/app/entity/view/select/ViewSelect.tsx @@ -1,4 +1,4 @@ -import React, { useEffect, useRef, useState } from 'react'; +import React, { CSSProperties, useEffect, useRef, useState } from 'react'; import { useHistory } from 'react-router'; import { Select } from 'antd'; import styled from 'styled-components'; @@ -55,11 +55,21 @@ const ViewSelectContainer = styled.div` .ant-select-selection-item { font-weight: 700; font-size: 14px; + text-align: left; } } } `; +const SelectStyled = styled(Select)` + min-width: 90px; + max-width: 200px; +`; + +type Props = { + dropdownStyle?: CSSProperties; +}; + /** * The View Select component allows you to select a View to apply to query on the current page. For example, * search, recommendations, and browse. @@ -69,7 +79,7 @@ const ViewSelectContainer = styled.div` * * In the event that a user refreshes their browser, the state of the view should be saved as well. */ -export const ViewSelect = () => { +export const ViewSelect = ({ dropdownStyle = {} }: Props) => { const history = useHistory(); const userContext = useUserContext(); const [isOpen, setIsOpen] = useState(false); @@ -188,12 +198,11 @@ export const ViewSelect = () => { return ( - + {viewBuilderDisplayState.visible && ( { ref={clearButtonRef} onClick={onHandleClickClear} > - All Entities + View all ); diff --git a/datahub-web-react/src/app/search/SearchBar.tsx b/datahub-web-react/src/app/search/SearchBar.tsx index 7dbf3c55d021d..fb10e1ca0026e 100644 --- a/datahub-web-react/src/app/search/SearchBar.tsx +++ b/datahub-web-react/src/app/search/SearchBar.tsx @@ -377,7 +377,15 @@ export const SearchBar = ({ onKeyUp={handleStopPropagation} onKeyDown={handleStopPropagation} > - + )} Date: Mon, 21 Aug 2023 15:33:10 -0300 Subject: [PATCH 02/11] feat(search): Add word gram analyzer for name fields (#8611) Co-authored-by: Indy Prentice --- docs/advanced/no-code-modeling.md | 2 +- docs/modeling/extending-the-metadata-model.md | 21 +-- .../annotation/SearchableAnnotation.java | 5 +- .../models/EntitySpecBuilderTest.java | 7 +- .../indexbuilder/MappingsBuilder.java | 21 ++- .../indexbuilder/SettingsBuilder.java | 55 +++++++- .../query/request/SearchFieldConfig.java | 25 +++- .../query/request/SearchQueryBuilder.java | 56 +++++++- .../metadata/ESTestConfiguration.java | 7 + .../fixtures/ElasticSearchGoldenTest.java | 15 +-- .../fixtures/SampleDataFixtureTests.java | 125 ++++++++++++++++++ .../indexbuilder/MappingsBuilderTest.java | 15 ++- .../query/request/SearchQueryBuilderTest.java | 53 +++++--- .../request/SearchRequestHandlerTest.java | 11 +- .../pegasus/com/linkedin/chart/ChartInfo.pdl | 2 +- .../container/ContainerProperties.pdl | 6 +- .../com/linkedin/dashboard/DashboardInfo.pdl | 4 +- .../com/linkedin/datajob/DataFlowInfo.pdl | 2 +- .../com/linkedin/datajob/DataJobInfo.pdl | 2 +- .../dataplatform/DataPlatformInfo.pdl | 4 +- 
.../DataPlatformInstanceProperties.pdl | 2 +- .../DataProcessInstanceProperties.pdl | 2 +- .../dataproduct/DataProductProperties.pdl | 2 +- .../linkedin/dataset/DatasetProperties.pdl | 6 +- .../com/linkedin/domain/DomainProperties.pdl | 2 +- .../linkedin/glossary/GlossaryNodeInfo.pdl | 4 +- .../linkedin/glossary/GlossaryTermInfo.pdl | 4 +- .../identity/CorpUserEditableInfo.pdl | 2 +- .../com/linkedin/identity/CorpUserInfo.pdl | 4 +- .../linkedin/metadata/key/CorpGroupKey.pdl | 4 +- .../com/linkedin/metadata/key/CorpUserKey.pdl | 2 +- .../com/linkedin/metadata/key/DataFlowKey.pdl | 4 +- .../com/linkedin/metadata/key/DataJobKey.pdl | 2 +- .../linkedin/metadata/key/DataProcessKey.pdl | 4 +- .../com/linkedin/metadata/key/DatasetKey.pdl | 2 +- .../linkedin/metadata/key/GlossaryNodeKey.pdl | 4 +- .../linkedin/metadata/key/GlossaryTermKey.pdl | 4 +- .../linkedin/metadata/key/MLFeatureKey.pdl | 4 +- .../metadata/key/MLFeatureTableKey.pdl | 4 +- .../metadata/key/MLModelDeploymentKey.pdl | 4 +- .../linkedin/metadata/key/MLModelGroupKey.pdl | 4 +- .../com/linkedin/metadata/key/MLModelKey.pdl | 4 +- .../linkedin/metadata/key/MLPrimaryKeyKey.pdl | 4 +- .../com/linkedin/metadata/key/TagKey.pdl | 4 +- .../com/linkedin/notebook/NotebookInfo.pdl | 2 +- .../linkedin/ownership/OwnershipTypeInfo.pdl | 4 +- .../com/linkedin/query/QueryProperties.pdl | 4 +- .../com/linkedin/role/RoleProperties.pdl | 2 +- .../com/linkedin/tag/TagProperties.pdl | 2 +- .../config/search/SearchConfiguration.java | 1 + .../config/search/WordGramConfiguration.java | 11 ++ .../src/main/resources/application.yml | 6 +- .../com/datahub/test/TestEntityInfo.pdl | 5 + 53 files changed, 449 insertions(+), 108 deletions(-) create mode 100644 metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/WordGramConfiguration.java diff --git a/docs/advanced/no-code-modeling.md b/docs/advanced/no-code-modeling.md index e1fadee6d371a..9c8f6761a62bc 100644 --- a/docs/advanced/no-code-modeling.md +++ b/docs/advanced/no-code-modeling.md @@ -211,7 +211,7 @@ record ServiceKey { * Name of the service */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true } name: string diff --git a/docs/modeling/extending-the-metadata-model.md b/docs/modeling/extending-the-metadata-model.md index 32951ab2e41eb..f47630f44e772 100644 --- a/docs/modeling/extending-the-metadata-model.md +++ b/docs/modeling/extending-the-metadata-model.md @@ -323,7 +323,7 @@ It takes the following parameters: annotations. To customize the set of analyzers used to index a certain field, you must add a new field type and define the set of mappings to be applied in the MappingsBuilder. - Thus far, we have implemented 10 fieldTypes: + Thus far, we have implemented 11 fieldTypes: 1. *KEYWORD* - Short text fields that only support exact matches, often used only for filtering @@ -332,20 +332,25 @@ It takes the following parameters: 3. *TEXT_PARTIAL* - Text fields delimited by spaces/slashes/periods with partial matching support. Note, partial matching is expensive, so this field type should not be applied to fields with long values (like description) - 4. *BROWSE_PATH* - Field type for browse paths. Applies specific mappings for slash delimited paths. + 4. *WORD_GRAM* - Text fields delimited by spaces, slashes, periods, dashes, or underscores with partial matching AND + word gram support. 
That is, the text will be split by the delimiters and can be matched with delimited queries + matching two, three, or four length tokens in addition to single tokens. As with partial match, this type is + expensive, so should not be applied to fields with long values such as description. - 5. *URN* - Urn fields where each sub-component inside the urn is indexed. For instance, for a data platform urn like + 5. *BROWSE_PATH* - Field type for browse paths. Applies specific mappings for slash delimited paths. + + 6. *URN* - Urn fields where each sub-component inside the urn is indexed. For instance, for a data platform urn like "urn:li:dataplatform:kafka", it will index the platform name "kafka" and ignore the common components - 6. *URN_PARTIAL* - Urn fields where each sub-component inside the urn is indexed with partial matching support. + 7. *URN_PARTIAL* - Urn fields where each sub-component inside the urn is indexed with partial matching support. - 7. *BOOLEAN* - Boolean fields used for filtering. + 8. *BOOLEAN* - Boolean fields used for filtering. - 8. *COUNT* - Count fields used for filtering. + 9. *COUNT* - Count fields used for filtering. - 9. *DATETIME* - Datetime fields used to represent timestamps. + 10. *DATETIME* - Datetime fields used to represent timestamps. - 10. *OBJECT* - Each property in an object will become an extra column in Elasticsearch and can be referenced as + 11. *OBJECT* - Each property in an object will become an extra column in Elasticsearch and can be referenced as `field.property` in queries. You should be careful to not use it on objects with many properties as it can cause a mapping explosion in Elasticsearch. diff --git a/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java b/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java index f2e65c771c6eb..3d3fbcf3ccaa6 100644 --- a/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java +++ b/entity-registry/src/main/java/com/linkedin/metadata/models/annotation/SearchableAnnotation.java @@ -21,7 +21,7 @@ public class SearchableAnnotation { public static final String ANNOTATION_NAME = "Searchable"; private static final Set DEFAULT_QUERY_FIELD_TYPES = - ImmutableSet.of(FieldType.TEXT, FieldType.TEXT_PARTIAL, FieldType.URN, FieldType.URN_PARTIAL); + ImmutableSet.of(FieldType.TEXT, FieldType.TEXT_PARTIAL, FieldType.WORD_GRAM, FieldType.URN, FieldType.URN_PARTIAL); // Name of the field in the search index. 
Defaults to the field name in the schema String fieldName; @@ -59,7 +59,8 @@ public enum FieldType { COUNT, DATETIME, OBJECT, - BROWSE_PATH_V2 + BROWSE_PATH_V2, + WORD_GRAM } @Nonnull diff --git a/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java b/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java index 1ab5ff640ce32..3618108970afa 100644 --- a/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java +++ b/entity-registry/src/test/java/com/linkedin/metadata/models/EntitySpecBuilderTest.java @@ -142,7 +142,7 @@ private void validateTestEntityInfo(final AspectSpec testEntityInfo) { assertEquals(new TestEntityInfo().schema().getFullName(), testEntityInfo.getPegasusSchema().getFullName()); // Assert on Searchable Fields - assertEquals(9, testEntityInfo.getSearchableFieldSpecs().size()); + assertEquals(testEntityInfo.getSearchableFieldSpecs().size(), 10); assertEquals("customProperties", testEntityInfo.getSearchableFieldSpecMap().get( new PathSpec("customProperties").toString()).getSearchableAnnotation().getFieldName()); assertEquals(SearchableAnnotation.FieldType.KEYWORD, testEntityInfo.getSearchableFieldSpecMap().get( @@ -158,6 +158,11 @@ private void validateTestEntityInfo(final AspectSpec testEntityInfo) { assertEquals(SearchableAnnotation.FieldType.TEXT_PARTIAL, testEntityInfo.getSearchableFieldSpecMap().get( new PathSpec("textArrayField", "*").toString()) .getSearchableAnnotation().getFieldType()); + assertEquals("wordGramField", testEntityInfo.getSearchableFieldSpecMap().get( + new PathSpec("wordGramField").toString()).getSearchableAnnotation().getFieldName()); + assertEquals(SearchableAnnotation.FieldType.WORD_GRAM, testEntityInfo.getSearchableFieldSpecMap().get( + new PathSpec("wordGramField").toString()) + .getSearchableAnnotation().getFieldType()); assertEquals("nestedIntegerField", testEntityInfo.getSearchableFieldSpecMap().get( new PathSpec("nestedRecordField", "nestedIntegerField").toString()).getSearchableAnnotation().getFieldName()); assertEquals(SearchableAnnotation.FieldType.COUNT, testEntityInfo.getSearchableFieldSpecMap().get( diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java index 555acb2ffdd3b..efa4e0c279a76 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilder.java @@ -42,6 +42,9 @@ public static Map getPartialNgramConfigWithOverrides(Map getMappingsForField(@Nonnull final Searchable mappingForField.put(NORMALIZER, KEYWORD_NORMALIZER); // Add keyword subfield without lowercase filter mappingForField.put(FIELDS, ImmutableMap.of(KEYWORD, KEYWORD_TYPE_MAP)); - } else if (fieldType == FieldType.TEXT || fieldType == FieldType.TEXT_PARTIAL) { + } else if (fieldType == FieldType.TEXT || fieldType == FieldType.TEXT_PARTIAL || fieldType == FieldType.WORD_GRAM) { mappingForField.put(TYPE, KEYWORD); mappingForField.put(NORMALIZER, KEYWORD_NORMALIZER); Map subFields = new HashMap<>(); - if (fieldType == FieldType.TEXT_PARTIAL) { + if (fieldType == FieldType.TEXT_PARTIAL || fieldType == FieldType.WORD_GRAM) { subFields.put(NGRAM, getPartialNgramConfigWithOverrides( ImmutableMap.of( ANALYZER, PARTIAL_ANALYZER ) )); + if (fieldType == FieldType.WORD_GRAM) { 
+ for (Map.Entry entry : Map.of( + WORD_GRAMS_LENGTH_2, WORD_GRAM_2_ANALYZER, + WORD_GRAMS_LENGTH_3, WORD_GRAM_3_ANALYZER, + WORD_GRAMS_LENGTH_4, WORD_GRAM_4_ANALYZER).entrySet()) { + String fieldName = entry.getKey(); + String analyzerName = entry.getValue(); + subFields.put(fieldName, ImmutableMap.of( + TYPE, TEXT, + ANALYZER, analyzerName, + SEARCH_ANALYZER, analyzerName + )); + } + } } subFields.put(DELIMITED, ImmutableMap.of( TYPE, TEXT, diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java index 5b3e396837aa7..e180c8296b48d 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/SettingsBuilder.java @@ -66,6 +66,9 @@ public class SettingsBuilder { public static final String KEYWORD_ANALYZER = "keyword"; public static final String URN_ANALYZER = "urn_component"; public static final String URN_SEARCH_ANALYZER = "query_urn_component"; + public static final String WORD_GRAM_2_ANALYZER = "word_gram_2"; + public static final String WORD_GRAM_3_ANALYZER = "word_gram_3"; + public static final String WORD_GRAM_4_ANALYZER = "word_gram_4"; // Filters public static final String ALPHANUM_SPACE_ONLY = "alpha_num_space"; @@ -80,6 +83,10 @@ public class SettingsBuilder { public static final String MULTIFILTER = "multifilter"; public static final String MULTIFILTER_GRAPH = "multifilter_graph"; public static final String PARTIAL_URN_COMPONENT = "partial_urn_component"; + public static final String SHINGLE = "shingle"; + public static final String WORD_GRAM_2_FILTER = "word_gram_2_filter"; + public static final String WORD_GRAM_3_FILTER = "word_gram_3_filter"; + public static final String WORD_GRAM_4_FILTER = "word_gram_4_filter"; public static final String SNOWBALL = "snowball"; public static final String STEM_OVERRIDE = "stem_override"; public static final String STOP = "stop"; @@ -108,6 +115,7 @@ public class SettingsBuilder { public static final String SLASH_TOKENIZER = "slash_tokenizer"; public static final String UNIT_SEPARATOR_PATH_TOKENIZER = "unit_separator_path_tokenizer"; public static final String UNIT_SEPARATOR_TOKENIZER = "unit_separator_tokenizer"; + public static final String WORD_GRAM_TOKENIZER = "word_gram_tokenizer"; // Do not remove the space, needed for multi-term synonyms public static final List ALPHANUM_SPACE_PATTERNS = ImmutableList.of( "([a-z0-9 _-]{2,})", @@ -161,6 +169,13 @@ public class SettingsBuilder { AUTOCOMPLETE_CUSTOM_DELIMITER, LOWERCASE); + public static final List WORD_GRAM_TOKEN_FILTERS = ImmutableList.of( + ASCII_FOLDING, + LOWERCASE, + TRIM, + REMOVE_QUOTES + ); + public final Map settings; public SettingsBuilder(String mainTokenizer) { @@ -275,6 +290,17 @@ private static Map buildFilters() throws IOException { .collect(Collectors.toList())) .build()); } + + for (Map.Entry entry : Map.of(WORD_GRAM_2_FILTER, 2, WORD_GRAM_3_FILTER, 3, WORD_GRAM_4_FILTER, 4).entrySet()) { + String filterName = entry.getKey(); + Integer gramSize = entry.getValue(); + filters.put(filterName, ImmutableMap.builder() + .put(TYPE, SHINGLE) + .put("min_shingle_size", gramSize) + .put("max_shingle_size", gramSize) + .put("output_unigrams", false) + .build()); + } } return filters.build(); @@ -302,13 +328,24 @@ private static Map buildTokenizers() { .put(DELIMITER, "␟") .build()); 
- // Tokenize by whitespace and most special chars + // Tokenize by most special chars + // Do NOT tokenize by whitespace to keep multi-word synonyms in the same token + // The split by whitespace is done later in the token filters phase tokenizers.put(MAIN_TOKENIZER, ImmutableMap.builder() .put(TYPE, PATTERN) .put(PATTERN, "[(),./:]") .build()); + // Tokenize by whitespace and most special chars for wordgrams + // only split on - when not preceded by a whitespace to preserve exclusion functionality + // i.e. "logging-events-bkcp" and "logging-events -bckp" should be handled differently + tokenizers.put(WORD_GRAM_TOKENIZER, + ImmutableMap.builder() + .put(TYPE, PATTERN) + .put(PATTERN, "[(),./:\\s_]|(?<=\\S)(-)") + .build()); + return tokenizers.build(); } @@ -382,6 +419,21 @@ private static Map buildAnalyzers(String mainTokenizer) { .put(FILTER, SEARCH_TOKEN_FILTERS) .build()); + // Support word grams + for (Map.Entry entry : Map.of( + WORD_GRAM_2_ANALYZER, WORD_GRAM_2_FILTER, + WORD_GRAM_3_ANALYZER, WORD_GRAM_3_FILTER, + WORD_GRAM_4_ANALYZER, WORD_GRAM_4_FILTER).entrySet()) { + String analyzerName = entry.getKey(); + String filterName = entry.getValue(); + analyzers.put(analyzerName, ImmutableMap.builder() + .put(TOKENIZER, WORD_GRAM_TOKENIZER) + .put(FILTER, ImmutableList.builder() + .addAll(WORD_GRAM_TOKEN_FILTERS) + .add(filterName).build()) + .build()); + } + // For special analysis, the substitution can be read from the configuration (chinese tokenizer: ik_smart / smartCN) // Analyzer for partial matching (i.e. autocomplete) - Prefix matching of each token analyzers.put(PARTIAL_ANALYZER, ImmutableMap.builder() @@ -395,6 +447,7 @@ private static Map buildAnalyzers(String mainTokenizer) { .put(FILTER, PARTIAL_AUTOCOMPLETE_TOKEN_FILTERS) .build()); + return analyzers.build(); } } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchFieldConfig.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchFieldConfig.java index fb7e19a5d67bc..a75ed40ffca52 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchFieldConfig.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchFieldConfig.java @@ -11,11 +11,8 @@ import java.util.Set; -import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.BROWSE_PATH_HIERARCHY_ANALYZER; -import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.BROWSE_PATH_V2_HIERARCHY_ANALYZER; -import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.KEYWORD_ANALYZER; -import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.TEXT_SEARCH_ANALYZER; -import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.URN_SEARCH_ANALYZER; +import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.*; + @Builder @Getter @@ -33,7 +30,8 @@ public class SearchFieldConfig { private static final Set TYPES_WITH_DELIMITED_SUBFIELD = Set.of( SearchableAnnotation.FieldType.TEXT, - SearchableAnnotation.FieldType.TEXT_PARTIAL + SearchableAnnotation.FieldType.TEXT_PARTIAL, + SearchableAnnotation.FieldType.WORD_GRAM // NOT URN_PARTIAL (urn field is special) ); // NOT comprehensive @@ -56,6 +54,7 @@ public class SearchFieldConfig { SearchableAnnotation.FieldType.TEXT, SearchableAnnotation.FieldType.TEXT_PARTIAL, SearchableAnnotation.FieldType.KEYWORD, + 
SearchableAnnotation.FieldType.WORD_GRAM, // not analyzed SearchableAnnotation.FieldType.BOOLEAN, SearchableAnnotation.FieldType.COUNT, @@ -69,6 +68,11 @@ public class SearchFieldConfig { SearchableAnnotation.FieldType.URN_PARTIAL ); + public static final Set TYPES_WITH_WORD_GRAM = + Set.of( + SearchableAnnotation.FieldType.WORD_GRAM + ); + @Nonnull private final String fieldName; @Nonnull @@ -78,9 +82,11 @@ public class SearchFieldConfig { private final String analyzer; private boolean hasKeywordSubfield; private boolean hasDelimitedSubfield; + private boolean hasWordGramSubfields; private boolean isQueryByDefault; private boolean isDelimitedSubfield; private boolean isKeywordSubfield; + private boolean isWordGramSubfield; public static SearchFieldConfig detectSubFieldType(@Nonnull SearchableFieldSpec fieldSpec) { final SearchableAnnotation searchableAnnotation = fieldSpec.getSearchableAnnotation(); @@ -106,6 +112,7 @@ public static SearchFieldConfig detectSubFieldType(String fieldName, .analyzer(getAnalyzer(fieldName, fieldType)) .hasKeywordSubfield(hasKeywordSubfield(fieldName, fieldType)) .hasDelimitedSubfield(hasDelimitedSubfield(fieldName, fieldType)) + .hasWordGramSubfields(hasWordGramSubfields(fieldName, fieldType)) .isQueryByDefault(isQueryByDefault) .build(); } @@ -118,6 +125,11 @@ private static boolean hasDelimitedSubfield(String fieldName, SearchableAnnotati return !fieldName.contains(".") && ("urn".equals(fieldName) || TYPES_WITH_DELIMITED_SUBFIELD.contains(fieldType)); } + + private static boolean hasWordGramSubfields(String fieldName, SearchableAnnotation.FieldType fieldType) { + return !fieldName.contains(".") + && (TYPES_WITH_WORD_GRAM.contains(fieldType)); + } private static boolean hasKeywordSubfield(String fieldName, SearchableAnnotation.FieldType fieldType) { return !"urn".equals(fieldName) && !fieldName.contains(".") @@ -155,6 +167,7 @@ public SearchFieldConfigBuilder fieldName(@Nonnull String fieldName) { this.fieldName = fieldName; isDelimitedSubfield(fieldName.endsWith(".delimited")); isKeywordSubfield(fieldName.endsWith(".keyword")); + isWordGramSubfield(fieldName.contains("wordGrams")); shortName(fieldName.split("[.]")[0]); return this; } diff --git a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java index 289c6f1f84e32..49fc882314e0a 100644 --- a/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java +++ b/metadata-io/src/main/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilder.java @@ -3,6 +3,7 @@ import com.linkedin.metadata.config.search.ExactMatchConfiguration; import com.linkedin.metadata.config.search.PartialConfiguration; import com.linkedin.metadata.config.search.SearchConfiguration; +import com.linkedin.metadata.config.search.WordGramConfiguration; import com.linkedin.metadata.config.search.custom.BoolQueryConfiguration; import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; import com.linkedin.metadata.config.search.custom.QueryConfiguration; @@ -51,6 +52,9 @@ import org.elasticsearch.search.SearchModule; import static com.linkedin.metadata.models.SearchableFieldSpecExtractor.PRIMARY_URN_SEARCH_PROPERTIES; +import static com.linkedin.metadata.search.elasticsearch.indexbuilder.SettingsBuilder.*; +import static com.linkedin.metadata.search.elasticsearch.query.request.SearchFieldConfig.*; + 
@Slf4j public class SearchQueryBuilder { @@ -69,6 +73,7 @@ public class SearchQueryBuilder { public static final String STRUCTURED_QUERY_PREFIX = "\\\\/q "; private final ExactMatchConfiguration exactMatchConfiguration; private final PartialConfiguration partialConfiguration; + private final WordGramConfiguration wordGramConfiguration; private final CustomizedQueryHandler customizedQueryHandler; @@ -76,6 +81,7 @@ public SearchQueryBuilder(@Nonnull SearchConfiguration searchConfiguration, @Nullable CustomSearchConfiguration customSearchConfiguration) { this.exactMatchConfiguration = searchConfiguration.getExactMatch(); this.partialConfiguration = searchConfiguration.getPartial(); + this.wordGramConfiguration = searchConfiguration.getWordGram(); this.customizedQueryHandler = CustomizedQueryHandler.builder(customSearchConfiguration).build(); } @@ -148,6 +154,36 @@ private Set getStandardFields(@Nonnull EntitySpec entitySpec) fields.add(SearchFieldConfig.detectSubFieldType(searchFieldConfig.fieldName() + ".delimited", searchFieldConfig.boost() * partialConfiguration.getFactor(), searchableAnnotation.getFieldType(), searchableAnnotation.isQueryByDefault())); + + if (SearchFieldConfig.detectSubFieldType(fieldSpec).hasWordGramSubfields()) { + fields.add(SearchFieldConfig.builder() + .fieldName(searchFieldConfig.fieldName() + ".wordGrams2") + .boost(searchFieldConfig.boost() * wordGramConfiguration.getTwoGramFactor()) + .analyzer(WORD_GRAM_2_ANALYZER) + .hasKeywordSubfield(true) + .hasDelimitedSubfield(true) + .hasWordGramSubfields(true) + .isQueryByDefault(true) + .build()); + fields.add(SearchFieldConfig.builder() + .fieldName(searchFieldConfig.fieldName() + ".wordGrams3") + .boost(searchFieldConfig.boost() * wordGramConfiguration.getThreeGramFactor()) + .analyzer(WORD_GRAM_3_ANALYZER) + .hasKeywordSubfield(true) + .hasDelimitedSubfield(true) + .hasWordGramSubfields(true) + .isQueryByDefault(true) + .build()); + fields.add(SearchFieldConfig.builder() + .fieldName(searchFieldConfig.fieldName() + ".wordGrams4") + .boost(searchFieldConfig.boost() * wordGramConfiguration.getFourGramFactor()) + .analyzer(WORD_GRAM_4_ANALYZER) + .hasKeywordSubfield(true) + .hasDelimitedSubfield(true) + .hasWordGramSubfields(true) + .isQueryByDefault(true) + .build()); + } } } @@ -188,7 +224,7 @@ private Optional getSimpleQuery(@Nullable QueryConfiguration custo .filter(SearchFieldConfig::isQueryByDefault) .collect(Collectors.groupingBy(SearchFieldConfig::analyzer)); - analyzerGroup.keySet().stream().sorted().forEach(analyzer -> { + analyzerGroup.keySet().stream().sorted().filter(str -> !str.contains("word_gram")).forEach(analyzer -> { List fieldConfigs = analyzerGroup.get(analyzer); SimpleQueryStringBuilder simpleBuilder = QueryBuilders.simpleQueryStringQuery(sanitizedQuery); simpleBuilder.analyzer(analyzer); @@ -253,6 +289,13 @@ private Optional getPrefixAndExactMatchQuery(@Nullable QueryConfig * exactMatchConfiguration.getCaseSensitivityFactor()) .queryName(searchFieldConfig.fieldName())); } + + if (searchFieldConfig.isWordGramSubfield() && isPrefixQuery) { + finalQuery.should(QueryBuilders + .matchPhraseQuery(ESUtils.toKeywordField(searchFieldConfig.fieldName(), false), unquotedQuery) + .boost(searchFieldConfig.boost() * getWordGramFactor(searchFieldConfig.fieldName())) + .queryName(searchFieldConfig.shortName())); + } }); return finalQuery.should().size() > 0 ? 
Optional.of(finalQuery) : Optional.empty(); @@ -377,4 +420,15 @@ private FunctionScoreQueryBuilder toFunctionScoreQueryBuilder(QueryBuilder query throw new RuntimeException(e); } } + + public float getWordGramFactor(String fieldName) { + if (fieldName.endsWith("Grams2")) { + return wordGramConfiguration.getTwoGramFactor(); + } else if (fieldName.endsWith("Grams3")) { + return wordGramConfiguration.getThreeGramFactor(); + } else if (fieldName.endsWith("Grams4")) { + return wordGramConfiguration.getFourGramFactor(); + } + throw new IllegalArgumentException(fieldName + " does not end with Grams[2-4]"); + } } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java b/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java index 1e5b860b581fc..673474c96cc51 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/ESTestConfiguration.java @@ -6,6 +6,7 @@ import com.linkedin.metadata.config.search.ExactMatchConfiguration; import com.linkedin.metadata.config.search.PartialConfiguration; import com.linkedin.metadata.config.search.SearchConfiguration; +import com.linkedin.metadata.config.search.WordGramConfiguration; import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; import com.linkedin.metadata.models.registry.ConfigEntityRegistry; import com.linkedin.metadata.models.registry.EntityRegistry; @@ -55,11 +56,17 @@ public SearchConfiguration searchConfiguration() { exactMatchConfiguration.setCaseSensitivityFactor(0.7f); exactMatchConfiguration.setEnableStructured(true); + WordGramConfiguration wordGramConfiguration = new WordGramConfiguration(); + wordGramConfiguration.setTwoGramFactor(1.2f); + wordGramConfiguration.setThreeGramFactor(1.5f); + wordGramConfiguration.setFourGramFactor(1.8f); + PartialConfiguration partialConfiguration = new PartialConfiguration(); partialConfiguration.setFactor(0.4f); partialConfiguration.setUrnFactor(0.5f); searchConfiguration.setExactMatch(exactMatchConfiguration); + searchConfiguration.setWordGram(wordGramConfiguration); searchConfiguration.setPartial(partialConfiguration); return searchConfiguration; } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java index cc0d9dca6ae5f..29457f244291f 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java @@ -116,15 +116,7 @@ public void testGlossaryTerms() { assertTrue(fourthResultMatchedFields.toString().contains("ReturnRate")); } - /** - * - * The test below should be added back in as improvements are made to search, - * via the linked tickets. 
- * - **/ - - // TODO: enable once PFP-481 is complete - @Test(enabled = false) + @Test public void testNameMatchPartiallyQualified() { /* Searching for "analytics.pet_details" (partially qualified) should return the fully qualified table @@ -140,4 +132,9 @@ public void testNameMatchPartiallyQualified() { assertTrue(secondResultUrn.toString().contains("dbt,long_tail_companions.analytics.pet_details")); } + /* + * Tests that should pass but do not yet can be added below here, with the following annotation: + * @Test(enabled = false) + **/ + } diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java index 2f1e48c18450d..d989d4ef4fa87 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/SampleDataFixtureTests.java @@ -358,6 +358,84 @@ public void testDelimitedSynonym() throws IOException { }).collect(Collectors.toList()); } + @Test + public void testNegateAnalysis() throws IOException { + String queryWithMinus = "logging_events -bckp"; + AnalyzeRequest request = AnalyzeRequest.withIndexAnalyzer( + "smpldat_datasetindex_v2", + "query_word_delimited", queryWithMinus + ); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), + List.of("logging_events -bckp", "logging_ev", "-bckp", "log", "event", "bckp")); + + request = AnalyzeRequest.withIndexAnalyzer( + "smpldat_datasetindex_v2", + "word_gram_3", queryWithMinus + ); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("logging events -bckp")); + + request = AnalyzeRequest.withIndexAnalyzer( + "smpldat_datasetindex_v2", + "word_gram_4", queryWithMinus + ); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of()); + + } + + @Test + public void testWordGram() throws IOException { + String text = "hello.cat_cool_customer"; + AnalyzeRequest request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", text); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hello cat", "cat cool", "cool customer")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_3", text); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hello cat cool", "cat cool customer")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_4", text); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hello cat cool customer")); + + String testMoreSeparators = "quick.brown:fox jumped-LAZY_Dog"; + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", testMoreSeparators); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), + List.of("quick brown", "brown fox", "fox jumped", "jumped lazy", "lazy dog")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_3", testMoreSeparators); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), + 
List.of("quick brown fox", "brown fox jumped", "fox jumped lazy", "jumped lazy dog")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_4", testMoreSeparators); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), + List.of("quick brown fox jumped", "brown fox jumped lazy", "fox jumped lazy dog")); + + String textWithQuotesAndDuplicateWord = "\"my_db.my_exact_table\""; + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", textWithQuotesAndDuplicateWord); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("my db", "db my", "my exact", "exact table")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_3", textWithQuotesAndDuplicateWord); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("my db my", "db my exact", "my exact table")); + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_4", textWithQuotesAndDuplicateWord); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("my db my exact", "db my exact table")); + + String textWithParens = "(hi) there"; + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", "word_gram_2", textWithParens); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of("hi there")); + + String oneWordText = "hello"; + for (String analyzer : List.of("word_gram_2", "word_gram_3", "word_gram_4")) { + request = AnalyzeRequest.withIndexAnalyzer("smpldat_datasetindex_v2", analyzer, oneWordText); + assertEquals(getTokens(request) + .map(AnalyzeResponse.AnalyzeToken::getTerm).collect(Collectors.toList()), List.of()); + } + } + @Test public void testUrnSynonym() throws IOException { List expectedTokens = List.of("bigquery"); @@ -1267,6 +1345,53 @@ public void testParens() { String.format("%s - Expected search results to include matched fields", query)); assertEquals(result.getEntities().size(), 2); } + @Test + public void testGram() { + String query = "jaffle shop customers"; + SearchResult result = searchAcrossEntities(searchService, query); + assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), + String.format("%s - Expected search results", query)); + + assertEquals(result.getEntities().get(0).getEntity().toString(), + "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.customers,PROD)", + "Expected exact match in 1st position"); + + query = "shop customers source"; + result = searchAcrossEntities(searchService, query); + assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), + String.format("%s - Expected search results", query)); + + assertEquals(result.getEntities().get(0).getEntity().toString(), + "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.customers_source,PROD)", + "Expected ngram match in 1st position"); + + query = "jaffle shop stg customers"; + result = searchAcrossEntities(searchService, query); + assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), + String.format("%s - Expected search results", query)); + + assertEquals(result.getEntities().get(0).getEntity().toString(), + "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.stg_customers,PROD)", + "Expected ngram match in 1st position"); + + query = 
"jaffle shop transformers customers"; + result = searchAcrossEntities(searchService, query); + assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), + String.format("%s - Expected search results", query)); + + assertEquals(result.getEntities().get(0).getEntity().toString(), + "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.transformers_customers,PROD)", + "Expected ngram match in 1st position"); + + query = "shop raw customers"; + result = searchAcrossEntities(searchService, query); + assertTrue(result.hasEntities() && !result.getEntities().isEmpty(), + String.format("%s - Expected search results", query)); + + assertEquals(result.getEntities().get(0).getEntity().toString(), + "urn:li:dataset:(urn:li:dataPlatform:dbt,cypress_project.jaffle_shop.raw_customers,PROD)", + "Expected ngram match in 1st position"); + } @Test public void testPrefixVsExact() { diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java index ed72b46e98c46..5a8f80f325dbd 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/indexbuilder/MappingsBuilderTest.java @@ -16,7 +16,7 @@ public void testMappingsBuilder() { Map result = MappingsBuilder.getMappings(TestEntitySpecBuilder.getSpec()); assertEquals(result.size(), 1); Map properties = (Map) result.get("properties"); - assertEquals(properties.size(), 17); + assertEquals(properties.size(), 18); assertEquals(properties.get("urn"), ImmutableMap.of("type", "keyword", "fields", ImmutableMap.of("delimited", @@ -76,6 +76,19 @@ public void testMappingsBuilder() { assertTrue(textArrayFieldSubfields.containsKey("ngram")); assertTrue(textArrayFieldSubfields.containsKey("keyword")); + // WORD_GRAM + Map wordGramField = (Map) properties.get("wordGramField"); + assertEquals(wordGramField.get("type"), "keyword"); + assertEquals(wordGramField.get("normalizer"), "keyword_normalizer"); + Map wordGramFieldSubfields = (Map) wordGramField.get("fields"); + assertEquals(wordGramFieldSubfields.size(), 6); + assertTrue(wordGramFieldSubfields.containsKey("delimited")); + assertTrue(wordGramFieldSubfields.containsKey("ngram")); + assertTrue(wordGramFieldSubfields.containsKey("keyword")); + assertTrue(wordGramFieldSubfields.containsKey("wordGrams2")); + assertTrue(wordGramFieldSubfields.containsKey("wordGrams3")); + assertTrue(wordGramFieldSubfields.containsKey("wordGrams4")); + // URN Map foreignKey = (Map) properties.get("foreignKey"); assertEquals(foreignKey.get("type"), "text"); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java index a2ec396c34b2d..282b1d8bb6778 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchQueryBuilderTest.java @@ -4,6 +4,7 @@ import com.linkedin.metadata.config.search.ExactMatchConfiguration; import com.linkedin.metadata.config.search.PartialConfiguration; import com.linkedin.metadata.config.search.SearchConfiguration; +import 
com.linkedin.metadata.config.search.WordGramConfiguration; import com.linkedin.metadata.config.search.custom.CustomSearchConfiguration; import com.fasterxml.jackson.dataformat.yaml.YAMLMapper; import com.google.common.collect.ImmutableList; @@ -18,6 +19,7 @@ import org.elasticsearch.index.query.BoolQueryBuilder; import org.elasticsearch.index.query.MatchAllQueryBuilder; import org.elasticsearch.index.query.MatchPhrasePrefixQueryBuilder; +import org.elasticsearch.index.query.MatchPhraseQueryBuilder; import org.elasticsearch.index.query.QueryBuilder; import org.elasticsearch.index.query.QueryStringQueryBuilder; import org.elasticsearch.index.query.SimpleQueryStringBuilder; @@ -46,11 +48,17 @@ public class SearchQueryBuilderTest { exactMatchConfiguration.setCaseSensitivityFactor(0.7f); exactMatchConfiguration.setEnableStructured(true); + WordGramConfiguration wordGramConfiguration = new WordGramConfiguration(); + wordGramConfiguration.setTwoGramFactor(1.2f); + wordGramConfiguration.setThreeGramFactor(1.5f); + wordGramConfiguration.setFourGramFactor(1.8f); + PartialConfiguration partialConfiguration = new PartialConfiguration(); partialConfiguration.setFactor(0.4f); partialConfiguration.setUrnFactor(0.7f); testQueryConfig.setExactMatch(exactMatchConfiguration); + testQueryConfig.setWordGram(wordGramConfiguration); testQueryConfig.setPartial(partialConfiguration); } public static final SearchQueryBuilder TEST_BUILDER = new SearchQueryBuilder(testQueryConfig, null); @@ -70,16 +78,17 @@ public void testQueryBuilderFulltext() { assertEquals(keywordQuery.value(), "testQuery"); assertEquals(keywordQuery.analyzer(), "keyword"); Map keywordFields = keywordQuery.fields(); - assertEquals(keywordFields.size(), 8); + assertEquals(keywordFields.size(), 9); assertEquals(keywordFields, Map.of( - "urn", 10.f, - "textArrayField", 1.0f, - "customProperties", 1.0f, - "nestedArrayArrayField", 1.0f, - "textFieldOverride", 1.0f, - "nestedArrayStringField", 1.0f, - "keyPart1", 10.0f, - "esObjectField", 1.0f + "urn", 10.f, + "textArrayField", 1.0f, + "customProperties", 1.0f, + "wordGramField", 1.0f, + "nestedArrayArrayField", 1.0f, + "textFieldOverride", 1.0f, + "nestedArrayStringField", 1.0f, + "keyPart1", 10.0f, + "esObjectField", 1.0f )); SimpleQueryStringBuilder urnComponentQuery = (SimpleQueryStringBuilder) analyzerGroupQuery.should().get(1); @@ -99,7 +108,8 @@ public void testQueryBuilderFulltext() { "nestedArrayArrayField.delimited", 0.4f, "urn.delimited", 7.0f, "textArrayField.delimited", 0.4f, - "nestedArrayStringField.delimited", 0.4f + "nestedArrayStringField.delimited", 0.4f, + "wordGramField.delimited", 0.4f )); BoolQueryBuilder boolPrefixQuery = (BoolQueryBuilder) shouldQueries.get(1); @@ -109,21 +119,30 @@ public void testQueryBuilderFulltext() { if (prefixQuery instanceof MatchPhrasePrefixQueryBuilder) { MatchPhrasePrefixQueryBuilder builder = (MatchPhrasePrefixQueryBuilder) prefixQuery; return Pair.of(builder.fieldName(), builder.boost()); - } else { + } else if (prefixQuery instanceof TermQueryBuilder) { // exact TermQueryBuilder builder = (TermQueryBuilder) prefixQuery; return Pair.of(builder.fieldName(), builder.boost()); + } else { // if (prefixQuery instanceof MatchPhraseQueryBuilder) { + // ngram + MatchPhraseQueryBuilder builder = (MatchPhraseQueryBuilder) prefixQuery; + return Pair.of(builder.fieldName(), builder.boost()); } }).collect(Collectors.toList()); - assertEquals(prefixFieldWeights.size(), 22); + assertEquals(prefixFieldWeights.size(), 28); List.of( Pair.of("urn", 100.0f), 
Pair.of("urn", 70.0f), Pair.of("keyPart1.delimited", 16.8f), Pair.of("keyPart1.keyword", 100.0f), - Pair.of("keyPart1.keyword", 70.0f) + Pair.of("keyPart1.keyword", 70.0f), + Pair.of("wordGramField.wordGrams2", 1.44f), + Pair.of("wordGramField.wordGrams3", 2.25f), + Pair.of("wordGramField.wordGrams4", 3.2399998f), + Pair.of("wordGramField.keyword", 10.0f), + Pair.of("wordGramField.keyword", 7.0f) ).forEach(p -> assertTrue(prefixFieldWeights.contains(p), "Missing: " + p)); // Validate scorer @@ -144,7 +163,7 @@ public void testQueryBuilderStructured() { assertEquals(keywordQuery.queryString(), "testQuery"); assertNull(keywordQuery.analyzer()); Map keywordFields = keywordQuery.fields(); - assertEquals(keywordFields.size(), 16); + assertEquals(keywordFields.size(), 21); assertEquals(keywordFields.get("keyPart1").floatValue(), 10.0f); assertFalse(keywordFields.containsKey("keyPart3")); assertEquals(keywordFields.get("textFieldOverride").floatValue(), 1.0f); @@ -196,10 +215,14 @@ public void testCustomExactMatch() { List queries = boolPrefixQuery.should().stream().map(prefixQuery -> { if (prefixQuery instanceof MatchPhrasePrefixQueryBuilder) { + // prefix return (MatchPhrasePrefixQueryBuilder) prefixQuery; - } else { + } else if (prefixQuery instanceof TermQueryBuilder) { // exact return (TermQueryBuilder) prefixQuery; + } else { // if (prefixQuery instanceof MatchPhraseQueryBuilder) { + // ngram + return (MatchPhraseQueryBuilder) prefixQuery; } }).collect(Collectors.toList()); diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java index d66d6a0ab0e76..db56e2d34881b 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/query/request/SearchRequestHandlerTest.java @@ -7,6 +7,7 @@ import com.linkedin.data.template.StringArray; import com.linkedin.metadata.ESTestConfiguration; import com.linkedin.metadata.TestEntitySpecBuilder; +import com.linkedin.metadata.config.search.WordGramConfiguration; import java.util.ArrayList; import java.util.Collection; import java.util.Collections; @@ -65,11 +66,17 @@ public class SearchRequestHandlerTest extends AbstractTestNGSpringContextTests { exactMatchConfiguration.setCaseSensitivityFactor(0.7f); exactMatchConfiguration.setEnableStructured(true); + WordGramConfiguration wordGramConfiguration = new WordGramConfiguration(); + wordGramConfiguration.setTwoGramFactor(1.2f); + wordGramConfiguration.setThreeGramFactor(1.5f); + wordGramConfiguration.setFourGramFactor(1.8f); + PartialConfiguration partialConfiguration = new PartialConfiguration(); partialConfiguration.setFactor(0.4f); partialConfiguration.setUrnFactor(0.7f); testQueryConfig.setExactMatch(exactMatchConfiguration); + testQueryConfig.setWordGram(wordGramConfiguration); testQueryConfig.setPartial(partialConfiguration); } @@ -113,10 +120,10 @@ public void testSearchRequestHandler() { HighlightBuilder highlightBuilder = sourceBuilder.highlighter(); List fields = highlightBuilder.fields().stream().map(HighlightBuilder.Field::name).collect(Collectors.toList()); - assertEquals(fields.size(), 20); + assertEquals(fields.size(), 22); List highlightableFields = ImmutableList.of("keyPart1", "textArrayField", "textFieldOverride", "foreignKey", "nestedForeignKey", - "nestedArrayStringField", 
"nestedArrayArrayField", "customProperties", "esObjectField"); + "nestedArrayStringField", "nestedArrayArrayField", "customProperties", "esObjectField", "wordGramField"); highlightableFields.forEach(field -> { assertTrue(fields.contains(field), "Missing: " + field); assertTrue(fields.contains(field + ".*"), "Missing: " + field + ".*"); diff --git a/metadata-models/src/main/pegasus/com/linkedin/chart/ChartInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/chart/ChartInfo.pdl index 4339a186f1304..5047c824e2617 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/chart/ChartInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/chart/ChartInfo.pdl @@ -20,7 +20,7 @@ record ChartInfo includes CustomProperties, ExternalReference { * Title of the chart */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true } title: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/container/ContainerProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/container/ContainerProperties.pdl index 26745fe46caaa..0b9c89ea30c90 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/container/ContainerProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/container/ContainerProperties.pdl @@ -15,7 +15,7 @@ record ContainerProperties includes CustomProperties, ExternalReference { * Display name of the Asset Container */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -25,7 +25,7 @@ record ContainerProperties includes CustomProperties, ExternalReference { * Fully-qualified name of the Container */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -61,4 +61,4 @@ record ContainerProperties includes CustomProperties, ExternalReference { } } lastModified: optional TimeStamp -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl index 5cb306039506e..84b3065a08022 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dashboard/DashboardInfo.pdl @@ -22,7 +22,7 @@ record DashboardInfo includes CustomProperties, ExternalReference { * Title of the dashboard */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -126,4 +126,4 @@ record DashboardInfo includes CustomProperties, ExternalReference { * The time when this dashboard last refreshed */ lastRefreshed: optional Time -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataFlowInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataFlowInfo.pdl index 481240740876a..1303bfbc863ea 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataFlowInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataFlowInfo.pdl @@ -17,7 +17,7 @@ record DataFlowInfo includes CustomProperties, ExternalReference { * Flow name */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl index 8737dd4d9ef52..1e305816f96a2 100644 --- 
a/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/datajob/DataJobInfo.pdl @@ -18,7 +18,7 @@ record DataJobInfo includes CustomProperties, ExternalReference { * Job name */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataplatform/DataPlatformInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataplatform/DataPlatformInfo.pdl index acc40e9f693ec..0be58d73dc79f 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataplatform/DataPlatformInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataplatform/DataPlatformInfo.pdl @@ -15,7 +15,7 @@ record DataPlatformInfo { */ @validate.strlen.max = 15 @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": false, "boostScore": 10.0 } @@ -25,7 +25,7 @@ record DataPlatformInfo { * The name that will be used for displaying a platform type. */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataplatforminstance/DataPlatformInstanceProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataplatforminstance/DataPlatformInstanceProperties.pdl index d7ce5565103ee..1220741ee5726 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataplatforminstance/DataPlatformInstanceProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataplatforminstance/DataPlatformInstanceProperties.pdl @@ -16,7 +16,7 @@ record DataPlatformInstanceProperties includes CustomProperties, ExternalReferen * Display name of the Data Platform Instance */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl index 72eefd5e294e4..46a490dbb2925 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl @@ -19,7 +19,7 @@ record DataProcessInstanceProperties includes CustomProperties, ExternalReferenc * Process name */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataproduct/DataProductProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataproduct/DataProductProperties.pdl index 3861b7def7669..c0a50a5e0e688 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataproduct/DataProductProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataproduct/DataProductProperties.pdl @@ -13,7 +13,7 @@ record DataProductProperties includes CustomProperties, ExternalReference { * Display name of the Data Product */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl index 57b1fe7693129..49d0dcd58ee27 100644 --- 
a/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataset/DatasetProperties.pdl @@ -17,7 +17,7 @@ record DatasetProperties includes CustomProperties, ExternalReference { * Display name of the Dataset */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -27,7 +27,7 @@ record DatasetProperties includes CustomProperties, ExternalReference { * Fully-qualified name of the Dataset */ @Searchable = { - "fieldType": "TEXT", + "fieldType": "WORD_GRAM", "addToFilters": false, "enableAutocomplete": true, "boostScore": 10.0 @@ -77,4 +77,4 @@ record DatasetProperties includes CustomProperties, ExternalReference { */ @deprecated = "Use GlobalTags aspect instead." tags: array[string] = [ ] -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl index 5a0b8657ecb47..a362d412a32b9 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/domain/DomainProperties.pdl @@ -14,7 +14,7 @@ record DomainProperties { * Display name of the Domain */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryNodeInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryNodeInfo.pdl index 1e840e5a1df7e..557b5e2a0f419 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryNodeInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryNodeInfo.pdl @@ -35,7 +35,7 @@ record GlossaryNodeInfo { */ @Searchable = { "fieldName": "displayName", - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -49,4 +49,4 @@ record GlossaryNodeInfo { } id: optional string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryTermInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryTermInfo.pdl index aa2a8b31e3dde..13e7af311fba1 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryTermInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/glossary/GlossaryTermInfo.pdl @@ -23,7 +23,7 @@ record GlossaryTermInfo includes CustomProperties { * Display name of the term */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -75,4 +75,4 @@ record GlossaryTermInfo includes CustomProperties { */ @deprecated rawSchema: optional string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl index 6b050f484fedd..48ee53377e582 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserEditableInfo.pdl @@ -45,7 +45,7 @@ record CorpUserEditableInfo { * DataHub-native display name */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "queryByDefault": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserInfo.pdl 
b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserInfo.pdl index 1cb705d426cc0..6cb0e8fd6aa6d 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/identity/CorpUserInfo.pdl @@ -26,7 +26,7 @@ record CorpUserInfo includes CustomProperties { * displayName of this user , e.g. Hang Zhang(DataHQ) */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "queryByDefault": true, "enableAutocomplete": true, "boostScore": 10.0 @@ -89,7 +89,7 @@ record CorpUserInfo includes CustomProperties { * Common name of this user, format is firstName + lastName (split by a whitespace) */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "queryByDefault": true, "enableAutocomplete": true, "boostScore": 10.0 diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpGroupKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpGroupKey.pdl index 075cc14ddc83b..9e65b8f6e9929 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpGroupKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpGroupKey.pdl @@ -11,10 +11,10 @@ record CorpGroupKey { * The URL-encoded name of the AD/LDAP group. Serves as a globally unique identifier within DataHub. */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "queryByDefault": true, "enableAutocomplete": true, "boostScore": 10.0 } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpUserKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpUserKey.pdl index d1a8a4bb5bb23..476a0ad9704b3 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpUserKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/CorpUserKey.pdl @@ -12,7 +12,7 @@ record CorpUserKey { */ @Searchable = { "fieldName": "ldap", - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "boostScore": 2.0, "enableAutocomplete": true } diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataFlowKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataFlowKey.pdl index bcdb92f75d055..d8342630248b6 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataFlowKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataFlowKey.pdl @@ -19,7 +19,7 @@ record DataFlowKey { * Unique Identifier of the data flow */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true } flowId: string @@ -31,4 +31,4 @@ record DataFlowKey { "fieldType": "TEXT_PARTIAL" } cluster: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataJobKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataJobKey.pdl index d0ac7dbca0f99..60ec51b464dcc 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataJobKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataJobKey.pdl @@ -27,7 +27,7 @@ record DataJobKey { * Unique Identifier of the data job */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true } jobId: string diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataProcessKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataProcessKey.pdl index 
a5c05029352c2..4df1364a04ebe 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataProcessKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DataProcessKey.pdl @@ -13,7 +13,7 @@ record DataProcessKey { * Process name i.e. an ETL job name */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 4.0 } @@ -37,4 +37,4 @@ record DataProcessKey { "queryByDefault": false } origin: FabricType -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DatasetKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DatasetKey.pdl index ea1f9510ed438..70c5d174171af 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DatasetKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/DatasetKey.pdl @@ -25,7 +25,7 @@ record DatasetKey { //This is no longer to be used for Dataset native name. Use name, qualifiedName from DatasetProperties instead. @Searchable = { "fieldName": "id" - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryNodeKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryNodeKey.pdl index 88697fe3ff364..51a3bc00f4e9e 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryNodeKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryNodeKey.pdl @@ -12,9 +12,9 @@ import com.linkedin.common.FabricType record GlossaryNodeKey { @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryTermKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryTermKey.pdl index a9f35146da18e..61bcd60cbc754 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryTermKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/GlossaryTermKey.pdl @@ -13,10 +13,10 @@ record GlossaryTermKey { * The term name, which serves as a unique id */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "fieldName": "id" } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl index 579f1966977a9..0dcb194bccce0 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureKey.pdl @@ -20,9 +20,9 @@ record MLFeatureKey { * Name of the feature */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 8.0 } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl index 1f786ad417be7..880daa4423573 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLFeatureTableKey.pdl @@ -22,9 +22,9 @@ record MLFeatureTableKey { * Name of the feature table */ @Searchable = { - 
"fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 8.0 } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelDeploymentKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelDeploymentKey.pdl index 7c36f410fede3..83ba35e0af601 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelDeploymentKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelDeploymentKey.pdl @@ -19,7 +19,7 @@ record MLModelDeploymentKey { * Name of the MLModelDeployment */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -35,4 +35,4 @@ record MLModelDeploymentKey { "queryByDefault": false } origin: FabricType -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl index 17c401c0b8c48..b1e2b7b7ede70 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelGroupKey.pdl @@ -19,7 +19,7 @@ record MLModelGroupKey { * Name of the MLModelGroup */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -33,4 +33,4 @@ record MLModelGroupKey { "queryByDefault": false } origin: FabricType -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl index 55fd2bc370846..24fe89dcce654 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLModelKey.pdl @@ -19,7 +19,7 @@ record MLModelKey { * Name of the MLModel */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -35,4 +35,4 @@ record MLModelKey { "queryByDefault": false } origin: FabricType -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl index 9eb67eaf5f651..7987f3a3345b7 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/MLPrimaryKeyKey.pdl @@ -21,9 +21,9 @@ record MLPrimaryKeyKey { * Name of the primary key */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 8.0 } name: string -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/TagKey.pdl b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/TagKey.pdl index 47f1a631b4a2c..4622e32dce67b 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/metadata/key/TagKey.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/metadata/key/TagKey.pdl @@ -11,10 +11,10 @@ record TagKey { * The tag name, which serves as a unique id */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0, "fieldName": "id" } name: string -} \ No newline at end of file +} diff --git 
a/metadata-models/src/main/pegasus/com/linkedin/notebook/NotebookInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/notebook/NotebookInfo.pdl index 1f4dcf975f48c..5df4daacffa49 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/notebook/NotebookInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/notebook/NotebookInfo.pdl @@ -18,7 +18,7 @@ record NotebookInfo includes CustomProperties, ExternalReference { * Title of the Notebook */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/ownership/OwnershipTypeInfo.pdl b/metadata-models/src/main/pegasus/com/linkedin/ownership/OwnershipTypeInfo.pdl index 004df6e399be4..3e7b53beff531 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/ownership/OwnershipTypeInfo.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/ownership/OwnershipTypeInfo.pdl @@ -14,7 +14,7 @@ record OwnershipTypeInfo { * Display name of the Ownership Type */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -54,4 +54,4 @@ record OwnershipTypeInfo { } } lastModified: AuditStamp -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl index bb7e22900e168..3ba19d348913b 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/query/QueryProperties.pdl @@ -29,7 +29,7 @@ record QueryProperties { * Optional display name to identify the query. */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } @@ -69,4 +69,4 @@ record QueryProperties { } } lastModified: AuditStamp -} \ No newline at end of file +} diff --git a/metadata-models/src/main/pegasus/com/linkedin/role/RoleProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/role/RoleProperties.pdl index acebdf5558c59..84d8ecc379ec2 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/role/RoleProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/role/RoleProperties.pdl @@ -14,7 +14,7 @@ record RoleProperties { * Display name of the IAM Role in the external system */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-models/src/main/pegasus/com/linkedin/tag/TagProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/tag/TagProperties.pdl index 41c500c6fff2f..e808aef491749 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/tag/TagProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/tag/TagProperties.pdl @@ -11,7 +11,7 @@ record TagProperties { * Display name of the tag */ @Searchable = { - "fieldType": "TEXT_PARTIAL", + "fieldType": "WORD_GRAM", "enableAutocomplete": true, "boostScore": 10.0 } diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/SearchConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/SearchConfiguration.java index 1a56db1bd68b0..b2b5260dc5e70 100644 --- a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/SearchConfiguration.java +++ 
b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/SearchConfiguration.java @@ -11,4 +11,5 @@ public class SearchConfiguration { private PartialConfiguration partial; private CustomConfiguration custom; private GraphQueryConfiguration graph; + private WordGramConfiguration wordGram; } diff --git a/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/WordGramConfiguration.java b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/WordGramConfiguration.java new file mode 100644 index 0000000000000..624d2a4c63c4c --- /dev/null +++ b/metadata-service/configuration/src/main/java/com/linkedin/metadata/config/search/WordGramConfiguration.java @@ -0,0 +1,11 @@ +package com.linkedin.metadata.config.search; + +import lombok.Data; + + +@Data +public class WordGramConfiguration { + private float twoGramFactor; + private float threeGramFactor; + private float fourGramFactor; +} diff --git a/metadata-service/configuration/src/main/resources/application.yml b/metadata-service/configuration/src/main/resources/application.yml index 9f7bf92039fdc..82cf9e8fdc8a7 100644 --- a/metadata-service/configuration/src/main/resources/application.yml +++ b/metadata-service/configuration/src/main/resources/application.yml @@ -198,6 +198,10 @@ elasticsearch: prefixFactor: ${ELASTICSEARCH_QUERY_EXACT_MATCH_PREFIX_FACTOR:1.6} # boost multiplier when exact prefix caseSensitivityFactor: ${ELASTICSEARCH_QUERY_EXACT_MATCH_CASE_FACTOR:0.7} # stacked boost multiplier when case mismatch enableStructured: ${ELASTICSEARCH_QUERY_EXACT_MATCH_ENABLE_STRUCTURED:true} # enable exact match on structured search + wordGram: + twoGramFactor: ${ELASTICSEARCH_QUERY_TWO_GRAM_FACTOR:1.2} # boost multiplier when match on 2-gram tokens + threeGramFactor: ${ELASTICSEARCH_QUERY_THREE_GRAM_FACTOR:1.5} # boost multiplier when match on 3-gram tokens + fourGramFactor: ${ELASTICSEARCH_QUERY_FOUR_GRAM_FACTOR:1.8} # boost multiplier when match on 4-gram tokens # Field weight annotations are typically calibrated for exact match, if partial match is possible on the field use these adjustments partial: urnFactor: ${ELASTICSEARCH_QUERY_PARTIAL_URN_FACTOR:0.5} # multiplier on Urn token match, a partial match on Urn > non-Urn is assumed @@ -318,4 +322,4 @@ cache: search: lineage: ttlSeconds: ${CACHE_SEARCH_LINEAGE_TTL_SECONDS:86400} # 1 day - lightningThreshold: ${CACHE_SEARCH_LINEAGE_LIGHTNING_THRESHOLD:300} \ No newline at end of file + lightningThreshold: ${CACHE_SEARCH_LINEAGE_LIGHTNING_THRESHOLD:300} diff --git a/test-models/src/main/pegasus/com/datahub/test/TestEntityInfo.pdl b/test-models/src/main/pegasus/com/datahub/test/TestEntityInfo.pdl index ed30244c31b17..cc579ba488174 100644 --- a/test-models/src/main/pegasus/com/datahub/test/TestEntityInfo.pdl +++ b/test-models/src/main/pegasus/com/datahub/test/TestEntityInfo.pdl @@ -25,6 +25,11 @@ record TestEntityInfo includes CustomProperties { } textArrayField: optional array[string] + @Searchable = { + "fieldType": "WORD_GRAM" + } + wordGramField: optional string + @Relationship = { "name": "foreignKey", "entityTypes": [] From 655914841bc6c840839ca0cdce751e4e11b6f06f Mon Sep 17 00:00:00 2001 From: david-leifker <114954101+david-leifker@users.noreply.github.com> Date: Tue, 22 Aug 2023 01:08:08 -0400 Subject: [PATCH 03/11] fix(docker): misc docker fixes (#8677) --- .github/workflows/docker-unified.yml | 56 ++++++++++--------- docker/build.gradle | 1 + docker/datahub-ingestion-base/Dockerfile | 2 +- 
docker/datahub-ingestion/Dockerfile | 2 +- docker/kafka-setup/Dockerfile | 14 ++--- docker/kafka-setup/kafka-ready.sh | 14 +++++ docker/kafka-setup/kafka-setup.sh | 4 +- .../fixtures/ElasticSearchGoldenTest.java | 2 + .../DataProcessInstanceProperties.pdl | 1 + smoke-test/run-quickstart.sh | 2 +- .../tests/cypress/cypress/e2e/login/login.js | 2 +- .../cypress/e2e/settings/managing_groups.js | 10 ++-- 12 files changed, 64 insertions(+), 46 deletions(-) create mode 100755 docker/kafka-setup/kafka-ready.sh mode change 100644 => 100755 docker/kafka-setup/kafka-setup.sh diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index c268a66938945..e8e12ac6def94 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -63,8 +63,8 @@ jobs: env: ENABLE_PUBLISH: ${{ secrets.DOCKER_PASSWORD != '' && secrets.ACRYL_DOCKER_PASSWORD != '' }} run: | - echo "Enable publish: ${{ env.ENABLE_PUBLISH != '' }}" - echo "publish=${{ env.ENABLE_PUBLISH != '' }}" >> $GITHUB_OUTPUT + echo "Enable publish: ${{ env.ENABLE_PUBLISH }}" + echo "publish=${{ env.ENABLE_PUBLISH }}" >> $GITHUB_OUTPUT gms_build: name: Build and Push DataHub GMS Docker Image @@ -451,8 +451,6 @@ jobs: tags: ${{ needs.setup.outputs.tag }} username: ${{ secrets.ACRYL_DOCKER_USERNAME }} password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} - build-args: | - DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }} publish: ${{ needs.setup.outputs.publish }} context: . file: ./docker/datahub-ingestion-base/Dockerfile @@ -481,7 +479,7 @@ jobs: uses: ishworkh/docker-image-artifact-download@v1 if: ${{ needs.setup.outputs.publish != 'true' && steps.filter.outputs.datahub-ingestion-base == 'true' }} with: - image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }} + image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_tag || 'head' }} - name: Build and push Base-Slim Image if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' }} uses: ./.github/actions/docker-custom-build-and-push @@ -493,16 +491,15 @@ jobs: username: ${{ secrets.ACRYL_DOCKER_USERNAME }} password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} build-args: | - DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }} APP_ENV=slim - BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }} + BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_tag || 'head' }} publish: ${{ needs.setup.outputs.publish }} context: . 
file: ./docker/datahub-ingestion-base/Dockerfile platforms: linux/amd64,linux/arm64/v8 - name: Compute DataHub Ingestion (Base-Slim) Tag id: tag - run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.slim_tag || 'head' }}" >> $GITHUB_OUTPUT + run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_slim_tag || 'head' }}" >> $GITHUB_OUTPUT datahub_ingestion_base_full_build: name: Build and Push DataHub Ingestion (Base-Full) Docker Image runs-on: ubuntu-latest @@ -524,7 +521,7 @@ jobs: uses: ishworkh/docker-image-artifact-download@v1 if: ${{ needs.setup.outputs.publish != 'true' && steps.filter.outputs.datahub-ingestion-base == 'true' }} with: - image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }} + image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_tag || 'head' }} - name: Build and push Base-Full Image if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' }} uses: ./.github/actions/docker-custom-build-and-push @@ -532,20 +529,19 @@ jobs: target: full-install images: | ${{ env.DATAHUB_INGESTION_BASE_IMAGE }} - tags: ${{ needs.setup.outputs.full_tag }} + tags: ${{ needs.setup.outputs.unique_full_tag }} username: ${{ secrets.ACRYL_DOCKER_USERNAME }} password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} build-args: | - DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }} APP_ENV=full - BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.tag || 'head' }} + BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_tag || 'head' }} publish: ${{ needs.setup.outputs.publish }} context: . 
file: ./docker/datahub-ingestion-base/Dockerfile platforms: linux/amd64,linux/arm64/v8 - name: Compute DataHub Ingestion (Base-Full) Tag id: tag - run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.full_tag || 'head' }}" >> $GITHUB_OUTPUT + run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }}" >> $GITHUB_OUTPUT datahub_ingestion_slim_build: @@ -572,9 +568,9 @@ jobs: run: ./gradlew :metadata-ingestion:codegen - name: Download Base Image uses: ishworkh/docker-image-artifact-download@v1 - if: ${{ needs.setup.outputs.publish != 'true' }} + if: ${{ needs.setup.outputs.publish != 'true' && steps.filter.outputs.datahub-ingestion-base == 'true' }} with: - image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.slim_tag || 'head' }} + image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_slim_tag || 'head' }} - name: Build and push Slim Image if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true' }} uses: ./.github/actions/docker-custom-build-and-push @@ -584,7 +580,7 @@ jobs: ${{ env.DATAHUB_INGESTION_IMAGE }} build-args: | BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }} - DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.slim_tag || 'head' }} + DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_slim_tag || 'head' }} APP_ENV=slim tags: ${{ needs.setup.outputs.slim_tag }} username: ${{ secrets.ACRYL_DOCKER_USERNAME }} @@ -595,7 +591,7 @@ jobs: platforms: linux/amd64,linux/arm64/v8 - name: Compute Tag id: tag - run: echo "tag=${{ (steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true') && needs.setup.outputs.slim_tag || 'head' }}" >> $GITHUB_OUTPUT + run: echo "tag=${{ (steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true') && needs.setup.outputs.unique_slim_tag || 'head' }}" >> $GITHUB_OUTPUT datahub_ingestion_slim_scan: permissions: contents: read # for actions/checkout to fetch code @@ -611,13 +607,13 @@ jobs: uses: ishworkh/docker-image-artifact-download@v1 if: ${{ needs.setup.outputs.publish != 'true' }} with: - image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.slim_tag }} + image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }} - name: Run Trivy vulnerability scanner Slim Image uses: aquasecurity/trivy-action@0.8.0 env: TRIVY_OFFLINE_SCAN: true with: - image-ref: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.slim_tag }} + image-ref: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }} format: "template" template: "@/contrib/sarif.tpl" output: "trivy-results.sarif" @@ -653,9 +649,9 @@ jobs: run: ./gradlew :metadata-ingestion:codegen - name: Download Base Image uses: ishworkh/docker-image-artifact-download@v1 - if: ${{ needs.setup.outputs.publish != 'true' }} + if: ${{ needs.setup.outputs.publish != 'true' && steps.filter.outputs.datahub-ingestion-base == 'true' }} with: - image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.full_tag || 'head' }} + 
image: ${{ env.DATAHUB_INGESTION_BASE_IMAGE }}:${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }} - name: Build and push Full Image if: ${{ steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true' }} uses: ./.github/actions/docker-custom-build-and-push @@ -665,8 +661,8 @@ jobs: ${{ env.DATAHUB_INGESTION_IMAGE }} build-args: | BASE_IMAGE=${{ env.DATAHUB_INGESTION_BASE_IMAGE }} - DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.full_tag || 'head' }} - tags: ${{ needs.setup.outputs.full_tag }} + DOCKER_VERSION=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }} + tags: ${{ needs.setup.outputs.unique_full_tag }} username: ${{ secrets.ACRYL_DOCKER_USERNAME }} password: ${{ secrets.ACRYL_DOCKER_PASSWORD }} publish: ${{ needs.setup.outputs.publish }} @@ -675,7 +671,7 @@ jobs: platforms: linux/amd64,linux/arm64/v8 - name: Compute Tag (Full) id: tag - run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.full_tag || 'head' }}" >> $GITHUB_OUTPUT + run: echo "tag=${{ steps.filter.outputs.datahub-ingestion-base == 'true' && needs.setup.outputs.unique_full_tag || 'head' }}" >> $GITHUB_OUTPUT datahub_ingestion_full_scan: permissions: contents: read # for actions/checkout to fetch code @@ -691,13 +687,13 @@ jobs: uses: ishworkh/docker-image-artifact-download@v1 if: ${{ needs.setup.outputs.publish != 'true' }} with: - image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_full_build.outputs.full_tag }} + image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_full_build.outputs.tag }} - name: Run Trivy vulnerability scanner Full Image uses: aquasecurity/trivy-action@0.8.0 env: TRIVY_OFFLINE_SCAN: true with: - image-ref: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_full_build.outputs.full_tag }} + image-ref: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_full_build.outputs.tag }} format: "template" template: "@/contrib/sarif.tpl" output: "trivy-results.sarif" @@ -750,6 +746,10 @@ jobs: ./gradlew :metadata-ingestion:install - name: Disk Check run: df -h . && docker images + - name: Remove images + run: docker image prune -a -f || true + - name: Disk Check + run: df -h . && docker images - name: Download GMS image uses: ishworkh/docker-image-artifact-download@v1 if: ${{ needs.setup.outputs.publish != 'true' }} @@ -794,7 +794,7 @@ jobs: uses: ishworkh/docker-image-artifact-download@v1 if: ${{ needs.setup.outputs.publish != 'true' }} with: - image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.setup.outputs.unique_tag }} + image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }} - name: Disk Check run: df -h . && docker images - name: run quickstart @@ -812,6 +812,8 @@ jobs: # we are doing this because gms takes time to get ready # and we don't have a better readiness check when bootstrap is done sleep 60s + - name: Disk Check + run: df -h . 
&& docker images - name: Disable ES Disk Threshold run: | curl -XPUT "http://localhost:9200/_cluster/settings" \ diff --git a/docker/build.gradle b/docker/build.gradle index 829bc344411f3..ae101fe1defc5 100644 --- a/docker/build.gradle +++ b/docker/build.gradle @@ -87,6 +87,7 @@ task quickstartDebug(type: Exec, dependsOn: ':metadata-ingestion:install') { dependsOn(debug_modules.collect { it + ':dockerTagDebug' }) shouldRunAfter ':metadata-ingestion:clean', 'quickstartNuke' + environment "DATAHUB_PRECREATE_TOPICS", "true" environment "DATAHUB_TELEMETRY_ENABLED", "false" environment "DOCKER_COMPOSE_BASE", "file://${rootProject.projectDir}" diff --git a/docker/datahub-ingestion-base/Dockerfile b/docker/datahub-ingestion-base/Dockerfile index bb4b0bc42e167..3d47f79617370 100644 --- a/docker/datahub-ingestion-base/Dockerfile +++ b/docker/datahub-ingestion-base/Dockerfile @@ -84,4 +84,4 @@ FROM ${BASE_IMAGE} as slim-install FROM ${APP_ENV}-install USER datahub -ENV PATH="/datahub-ingestion/.local/bin:$PATH" +ENV PATH="/datahub-ingestion/.local/bin:$PATH" \ No newline at end of file diff --git a/docker/datahub-ingestion/Dockerfile b/docker/datahub-ingestion/Dockerfile index d16caea2fcecd..0ecc30d02ac3f 100644 --- a/docker/datahub-ingestion/Dockerfile +++ b/docker/datahub-ingestion/Dockerfile @@ -30,4 +30,4 @@ FROM base as dev-install FROM ${APP_ENV}-install as final USER datahub -ENV PATH="/datahub-ingestion/.local/bin:$PATH" \ No newline at end of file +ENV PATH="/datahub-ingestion/.local/bin:$PATH" diff --git a/docker/kafka-setup/Dockerfile b/docker/kafka-setup/Dockerfile index 8cf9d0869dc9b..5707234b85f57 100644 --- a/docker/kafka-setup/Dockerfile +++ b/docker/kafka-setup/Dockerfile @@ -1,5 +1,7 @@ +ARG KAFKA_DOCKER_VERSION=7.4.1 + # Using as a base image because to get the needed jars for confluent utils -FROM confluentinc/cp-base-new@sha256:ac4e0f9bcaecdab728740529f37452231fa40760fcf561759fc3b219f46d2cc9 as confluent_base +FROM confluentinc/cp-base-new:$KAFKA_DOCKER_VERSION as confluent_base ARG MAVEN_REPO="https://repo1.maven.org/maven2" ARG SNAKEYAML_VERSION="2.0" @@ -16,12 +18,6 @@ ENV SCALA_VERSION 2.13 # Set the classpath for JARs required by `cub` ENV CUB_CLASSPATH='"/usr/share/java/cp-base-new/*"' -# Confluent Docker Utils Version (Namely the tag or branch to grab from git to install) -ARG PYTHON_CONFLUENT_DOCKER_UTILS_VERSION="v0.0.60" - -# This can be overriden for an offline/air-gapped builds -ARG PYTHON_CONFLUENT_DOCKER_UTILS_INSTALL_SPEC="git+https://github.com/confluentinc/confluent-docker-utils@${PYTHON_CONFLUENT_DOCKER_UTILS_VERSION}" - LABEL name="kafka" version=${KAFKA_VERSION} RUN apk add --no-cache bash coreutils @@ -39,7 +35,6 @@ RUN mkdir -p /opt \ && pip install --no-cache-dir --upgrade pip wheel setuptools \ && pip install jinja2 requests \ && pip install "Cython<3.0" "PyYAML<6" --no-build-isolation \ - && pip install --prefer-binary --prefix=/usr/local --upgrade "${PYTHON_CONFLUENT_DOCKER_UTILS_INSTALL_SPEC}" \ && rm -rf /tmp/* \ && apk del --purge .build-deps @@ -69,7 +64,8 @@ ENV USE_CONFLUENT_SCHEMA_REGISTRY="TRUE" COPY docker/kafka-setup/kafka-setup.sh ./kafka-setup.sh COPY docker/kafka-setup/kafka-config.sh ./kafka-config.sh COPY docker/kafka-setup/kafka-topic-workers.sh ./kafka-topic-workers.sh +COPY docker/kafka-setup/kafka-ready.sh ./kafka-ready.sh -RUN chmod +x ./kafka-setup.sh && chmod +x ./kafka-topic-workers.sh +RUN chmod +x ./kafka-setup.sh ./kafka-topic-workers.sh ./kafka-ready.sh CMD ./kafka-setup.sh diff --git a/docker/kafka-setup/kafka-ready.sh 
b/docker/kafka-setup/kafka-ready.sh new file mode 100755 index 0000000000000..ba87bde047ef5 --- /dev/null +++ b/docker/kafka-setup/kafka-ready.sh @@ -0,0 +1,14 @@ +#!/bin/bash + +for i in {1..60} +do + kafka-broker-api-versions.sh --command-config $CONNECTION_PROPERTIES_PATH --bootstrap-server $KAFKA_BOOTSTRAP_SERVER + if [ $? -eq 0 ]; then + break + fi + if [ $i -eq 60 ]; then + echo "Kafka bootstrap server $KAFKA_BOOTSTRAP_SERVER not ready." + exit 1 + fi + sleep 5s +done diff --git a/docker/kafka-setup/kafka-setup.sh b/docker/kafka-setup/kafka-setup.sh old mode 100644 new mode 100755 index 7b015421b7963..629e9bc9484ee --- a/docker/kafka-setup/kafka-setup.sh +++ b/docker/kafka-setup/kafka-setup.sh @@ -49,8 +49,8 @@ if [[ -n "$KAFKA_PROPERTIES_SASL_CLIENT_CALLBACK_HANDLER_CLASS" ]]; then echo "sasl.client.callback.handler.class=$KAFKA_PROPERTIES_SASL_CLIENT_CALLBACK_HANDLER_CLASS" >> $CONNECTION_PROPERTIES_PATH fi -cub kafka-ready -c $CONNECTION_PROPERTIES_PATH -b $KAFKA_BOOTSTRAP_SERVER 1 180 - +# cub kafka-ready -c $CONNECTION_PROPERTIES_PATH -b $KAFKA_BOOTSTRAP_SERVER 1 180 +. kafka-ready.sh ############################################################ # Start Topic Creation Logic diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java index 29457f244291f..8e8c20bd292e5 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java @@ -15,6 +15,7 @@ import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.context.annotation.Import; import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; +import org.testng.annotations.Ignore; import org.testng.annotations.Test; import java.util.List; @@ -96,6 +97,7 @@ public void testNameMatchMemberInWorkspace() { } @Test + @Ignore("unstable") public void testGlossaryTerms() { /* Searching for "ReturnRate" should return all tables that have the glossary term applied before diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl index 46a490dbb2925..c63cb1a97c017 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataprocess/DataProcessInstanceProperties.pdl @@ -31,6 +31,7 @@ record DataProcessInstanceProperties includes CustomProperties, ExternalReferenc @Searchable = { "fieldType": "KEYWORD", "addToFilters": true, + "fieldName": "processType", "filterNameOverride": "Process Type" } type: optional enum DataProcessType { diff --git a/smoke-test/run-quickstart.sh b/smoke-test/run-quickstart.sh index 050b5d2db95c9..d40e4a5e7a4aa 100755 --- a/smoke-test/run-quickstart.sh +++ b/smoke-test/run-quickstart.sh @@ -15,4 +15,4 @@ echo "test_user:test_pass" >> ~/.datahub/plugins/frontend/auth/user.props echo "DATAHUB_VERSION = $DATAHUB_VERSION" DATAHUB_TELEMETRY_ENABLED=false \ DOCKER_COMPOSE_BASE="file://$( dirname "$DIR" )" \ -datahub docker quickstart --version ${DATAHUB_VERSION} --standalone_consumers --dump-logs-on-failure --kafka-setup +datahub docker quickstart --version ${DATAHUB_VERSION} --standalone_consumers 
--dump-logs-on-failure --kafka-setup \ No newline at end of file diff --git a/smoke-test/tests/cypress/cypress/e2e/login/login.js b/smoke-test/tests/cypress/cypress/e2e/login/login.js index f86741b5afe01..74d04aa56d0d0 100644 --- a/smoke-test/tests/cypress/cypress/e2e/login/login.js +++ b/smoke-test/tests/cypress/cypress/e2e/login/login.js @@ -4,6 +4,6 @@ describe('login', () => { cy.get('input[data-testid=username]').type(Cypress.env('ADMIN_USERNAME')); cy.get('input[data-testid=password]').type(Cypress.env('ADMIN_PASSWORD')); cy.contains('Sign In').click(); - cy.contains('Welcome back, DataHub'); + cy.contains('Welcome back, Data Hub'); }); }) diff --git a/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js b/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js index 7686acfe50de0..353570c0d955b 100644 --- a/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js +++ b/smoke-test/tests/cypress/cypress/e2e/settings/managing_groups.js @@ -64,6 +64,8 @@ describe("create and manage group", () => { }); it("update group info", () => { + var expected_name = Cypress.env('ADMIN_USERNAME') == "datahub" ? "Data Hub" : Cypress.env('ADMIN_USERNAME'); + cy.loginWithCredentials(); cy.visit("/settings/identities/groups"); cy.clickOptionWithText(group_name); @@ -77,13 +79,13 @@ describe("create and manage group", () => { cy.contains("Test group description EDITED").should("be.visible"); cy.clickOptionWithText("Add Owners"); cy.contains("Search for users or groups...").click({ force: true }); - cy.focused().type(Cypress.env('ADMIN_USERNAME')); - cy.get(".ant-select-item-option").contains(Cypress.env('ADMIN_USERNAME'), { matchCase: false }).click(); + cy.focused().type(expected_name); + cy.get(".ant-select-item-option").contains(expected_name, { matchCase: false }).click(); cy.focused().blur(); - cy.contains(Cypress.env('ADMIN_USERNAME')).should("have.length", 1); + cy.contains(expected_name).should("have.length", 1); cy.get('[role="dialog"] button').contains("Done").click(); cy.waitTextVisible("Owners Added"); - cy.contains(Cypress.env('ADMIN_USERNAME'), { matchCase: false }).should("be.visible"); + cy.contains(expected_name, { matchCase: false }).should("be.visible"); cy.clickOptionWithText("Edit Group"); cy.waitTextVisible("Edit Profile"); cy.get("#email").type(`${test_id}@testemail.com`); From b0cb990bad0522ea77fabab6f4746f1fd6d4ba23 Mon Sep 17 00:00:00 2001 From: Ellie O'Neil <110510035+eboneil@users.noreply.github.com> Date: Tue, 22 Aug 2023 11:35:58 -0700 Subject: [PATCH 04/11] tests(search): more golden tests (#8683) --- .../fixtures/ElasticSearchGoldenTest.java | 69 +++++++++++++------ 1 file changed, 47 insertions(+), 22 deletions(-) diff --git a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java index 8e8c20bd292e5..d720c95fef84d 100644 --- a/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java +++ b/metadata-io/src/test/java/com/linkedin/metadata/search/elasticsearch/fixtures/ElasticSearchGoldenTest.java @@ -15,7 +15,6 @@ import org.springframework.beans.factory.annotation.Qualifier; import org.springframework.context.annotation.Import; import org.springframework.test.context.testng.AbstractTestNGSpringContextTests; -import org.testng.annotations.Ignore; import org.testng.annotations.Test; import java.util.List; @@ -80,24 +79,6 @@ public void 
testNameMatchPetProfile() { } @Test - public void testNameMatchMemberInWorkspace() { - /* - Searching for "collaborative actionitems" should return "collaborative_actionitems" as the first search - result, followed by "collaborative_actionitems_old" - */ - assertNotNull(searchService); - SearchResult searchResult = searchAcrossEntities(searchService, "collaborative actionitems", SEARCHABLE_LONGTAIL_ENTITIES); - assertTrue(searchResult.getEntities().size() >= 2); - Urn firstResultUrn = searchResult.getEntities().get(0).getEntity(); - Urn secondResultUrn = searchResult.getEntities().get(1).getEntity(); - - // Checks that the table name is not suffixed with anything - assertTrue(firstResultUrn.toString().contains("collaborative_actionitems,")); - assertTrue(secondResultUrn.toString().contains("collaborative_actionitems_old")); - } - - @Test - @Ignore("unstable") public void testGlossaryTerms() { /* Searching for "ReturnRate" should return all tables that have the glossary term applied before @@ -134,9 +115,53 @@ public void testNameMatchPartiallyQualified() { assertTrue(secondResultUrn.toString().contains("dbt,long_tail_companions.analytics.pet_details")); } + @Test + public void testNameMatchCollaborativeActionitems() { + /* + Searching for "collaborative actionitems" should return "collaborative_actionitems" as the first search + result, followed by "collaborative_actionitems_old" + */ + assertNotNull(searchService); + SearchResult searchResult = searchAcrossEntities(searchService, "collaborative actionitems", SEARCHABLE_LONGTAIL_ENTITIES); + assertTrue(searchResult.getEntities().size() >= 2); + Urn firstResultUrn = searchResult.getEntities().get(0).getEntity(); + Urn secondResultUrn = searchResult.getEntities().get(1).getEntity(); + + // Checks that the table name is not suffixed with anything + assertTrue(firstResultUrn.toString().contains("collaborative_actionitems,")); + assertTrue(secondResultUrn.toString().contains("collaborative_actionitems_old")); + + Double firstResultScore = searchResult.getEntities().get(0).getScore(); + Double secondResultScore = searchResult.getEntities().get(1).getScore(); + + // Checks that the scores aren't tied so that we are matching on table name more than column name + assertTrue(firstResultScore > secondResultScore); + } + + @Test + public void testNameMatchCustomerOrders() { + /* + Searching for "customer orders" should return "customer_orders" as the first search + result, not suffixed by anything + */ + assertNotNull(searchService); + SearchResult searchResult = searchAcrossEntities(searchService, "customer orders", SEARCHABLE_LONGTAIL_ENTITIES); + assertTrue(searchResult.getEntities().size() >= 2); + Urn firstResultUrn = searchResult.getEntities().get(0).getEntity(); + + // Checks that the table name is not suffixed with anything + assertTrue(firstResultUrn.toString().contains("customer_orders,")); + + Double firstResultScore = searchResult.getEntities().get(0).getScore(); + Double secondResultScore = searchResult.getEntities().get(1).getScore(); + + // Checks that the scores aren't tied so that we are matching on table name more than column name + assertTrue(firstResultScore > secondResultScore); + } + /* - * Tests that should pass but do not yet can be added below here, with the following annotation: - * @Test(enabled = false) - **/ + Tests that should pass but do not yet can be added below here, with the following annotation: + @Test(enabled = false) + */ } From 439cf4d7dcde7003de3a3fbe02339cbf72c7246a Mon Sep 17 00:00:00 2001 From: Andrew 
Sikowitz Date: Tue, 22 Aug 2023 16:27:46 -0400 Subject: [PATCH 05/11] test(ingest/vertica): Skip integration test failing CI; support arm Macs (#8694) --- .../tests/integration/vertica/docker-compose.yml | 4 +--- metadata-ingestion/tests/integration/vertica/test_vertica.py | 1 + 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/tests/integration/vertica/docker-compose.yml b/metadata-ingestion/tests/integration/vertica/docker-compose.yml index ddaf206f236cf..84af5c32a60e3 100644 --- a/metadata-ingestion/tests/integration/vertica/docker-compose.yml +++ b/metadata-ingestion/tests/integration/vertica/docker-compose.yml @@ -1,6 +1,7 @@ version: "3.9" services: vertica: + platform: linux/amd64 environment: APP_DB_USER: "dbadmin" APP_DB_PASSWORD: "abc123" @@ -18,6 +19,3 @@ services: volumes: vertica-data: - - - diff --git a/metadata-ingestion/tests/integration/vertica/test_vertica.py b/metadata-ingestion/tests/integration/vertica/test_vertica.py index db8bfd247313b..fe306d1d0b2b8 100644 --- a/metadata-ingestion/tests/integration/vertica/test_vertica.py +++ b/metadata-ingestion/tests/integration/vertica/test_vertica.py @@ -58,6 +58,7 @@ def vertica_runner(docker_compose_runner, test_resources_dir): # Test needs more work to be done , currently it is working fine. @freeze_time(FROZEN_TIME) +@pytest.mark.skip("Failing in CI, cmd failing with exit code 1") @pytest.mark.integration def test_vertica_ingest_with_db(vertica_runner, pytestconfig, tmp_path): test_resources_dir = pytestconfig.rootpath / "tests/integration/vertica" From d6e36f16de0f9b776767a898e7f64eb972ed8987 Mon Sep 17 00:00:00 2001 From: Harshal Sheth Date: Tue, 22 Aug 2023 21:27:02 -0700 Subject: [PATCH 06/11] ci: add `needs_artifact_download` output for ingestion image (#8695) --- .github/workflows/docker-unified.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/docker-unified.yml b/.github/workflows/docker-unified.yml index e8e12ac6def94..532669c44722c 100644 --- a/.github/workflows/docker-unified.yml +++ b/.github/workflows/docker-unified.yml @@ -549,6 +549,7 @@ jobs: runs-on: ubuntu-latest outputs: tag: ${{ steps.tag.outputs.tag }} + needs_artifact_download: ${{ (steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true') && needs.setup.outputs.publish != 'true' }} needs: [setup, datahub_ingestion_base_slim_build] steps: - name: Check out the repo @@ -605,7 +606,7 @@ jobs: uses: actions/checkout@v3 - name: Download image Slim Image uses: ishworkh/docker-image-artifact-download@v1 - if: ${{ needs.setup.outputs.publish != 'true' }} + if: ${{ needs.datahub_ingestion_slim_build.outputs.needs_artifact_download == 'true' }} with: image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }} - name: Run Trivy vulnerability scanner Slim Image @@ -630,6 +631,7 @@ jobs: runs-on: ubuntu-latest outputs: tag: ${{ steps.tag.outputs.tag }} + needs_artifact_download: ${{ (steps.filter.outputs.datahub-ingestion-base == 'true' || steps.filter.outputs.datahub-ingestion == 'true') && needs.setup.outputs.publish != 'true' }} needs: [setup, datahub_ingestion_base_full_build] steps: - name: Check out the repo @@ -685,7 +687,7 @@ jobs: uses: actions/checkout@v3 - name: Download image Full Image uses: ishworkh/docker-image-artifact-download@v1 - if: ${{ needs.setup.outputs.publish != 'true' }} + if: ${{ needs.datahub_ingestion_full_build.outputs.needs_artifact_download == 'true' }} with: image: ${{ 
env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_full_build.outputs.tag }} - name: Run Trivy vulnerability scanner Full Image @@ -792,7 +794,7 @@ jobs: image: ${{ env.DATAHUB_UPGRADE_IMAGE }}:${{ needs.setup.outputs.unique_tag }} - name: Download datahub-ingestion-slim image uses: ishworkh/docker-image-artifact-download@v1 - if: ${{ needs.setup.outputs.publish != 'true' }} + if: ${{ needs.datahub_ingestion_slim_build.outputs.needs_artifact_download == 'true' }} with: image: ${{ env.DATAHUB_INGESTION_IMAGE }}:${{ needs.datahub_ingestion_slim_build.outputs.tag }} - name: Disk Check From 4116716a1571919224947b793c0388437ebf4b68 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Wed, 23 Aug 2023 05:08:10 -0400 Subject: [PATCH 07/11] logs(ingestion/unity): Hide stack trace on sql parse failure logs (#8657) --- .../src/datahub/ingestion/source/unity/usage.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py b/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py index d5da93c7be35e..49f56b46fb012 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py +++ b/metadata-ingestion/src/datahub/ingestion/source/unity/usage.py @@ -176,10 +176,8 @@ def _parse_query_via_lineage_runner(self, query: str) -> Optional[StringTableInf for table in runner.target_tables ], ) - except Exception: - logger.info( - f"Could not parse query via lineage runner, {query}", exc_info=True - ) + except Exception as e: + logger.info(f"Could not parse query via lineage runner, {query}: {e!r}") return None @staticmethod @@ -202,8 +200,8 @@ def _parse_query_via_spark_sql_plan(self, query: str) -> Optional[StringTableInf return GenericTableInfo( source_tables=[t for t in tables if t], target_tables=[] ) - except Exception: - logger.info(f"Could not parse query via spark plan, {query}", exc_info=True) + except Exception as e: + logger.info(f"Could not parse query via spark plan, {query}: {e!r}") return None @staticmethod From 8ee58af0c249f74c93f3f8132ec9896da882a8cc Mon Sep 17 00:00:00 2001 From: siddiquebagwan-gslab Date: Wed, 23 Aug 2023 14:38:58 +0530 Subject: [PATCH 08/11] feat(ingestion/powerbi): support multiple tables as upstream in native SQL parsing (#8592) --- .../ingestion/source/powerbi/config.py | 15 + .../powerbi/dataplatform_instance_resolver.py | 14 +- .../powerbi/m_query/native_sql_parser.py | 33 +- .../source/powerbi/m_query/parser.py | 21 +- .../source/powerbi/m_query/resolver.py | 390 +++++++++++++++--- .../ingestion/source/powerbi/powerbi.py | 50 +-- .../src/datahub/ingestion/source/tableau.py | 52 +-- .../src/datahub/utilities/sqlglot_lineage.py | 40 ++ .../integration/powerbi/test_m_parser.py | 374 +++++++++++------ .../tableau/test_tableau_ingest.py | 6 +- 10 files changed, 714 insertions(+), 281 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py index 31d067f984d2d..ffa685fb25826 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/config.py @@ -121,6 +121,12 @@ class DataPlatformPair: powerbi_data_platform_name: str +@dataclass +class PowerBIPlatformDetail: + data_platform_pair: DataPlatformPair + data_platform_server: str + + class SupportedDataPlatform(Enum): POSTGRES_SQL = DataPlatformPair( powerbi_data_platform_name="PostgreSQL", datahub_data_platform_name="postgres" @@ -382,6 
+388,15 @@ class PowerBiDashboardSourceConfig( description="The instance of the platform that all assets produced by this recipe belong to", ) + # Enable advance sql construct + enable_advance_lineage_sql_construct: bool = pydantic.Field( + default=False, + description="Whether to enable advance native sql construct for parsing like join, sub-queries. " + "along this flag , the native_query_parsing should be enabled. " + "By default convert_lineage_urns_to_lowercase is enabled, in-case if you have disabled it in previous ingestion execution then it may break lineage " + "as this option generates the upstream datasets URN in lowercase.", + ) + @validator("dataset_type_mapping") @classmethod def map_data_platform(cls, value): diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py index 396da2d79e3b7..baaa8d5b85ae1 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/dataplatform_instance_resolver.py @@ -5,8 +5,8 @@ from datahub.ingestion.source.powerbi.config import ( PlatformDetail, PowerBiDashboardSourceConfig, + PowerBIPlatformDetail, ) -from datahub.ingestion.source.powerbi.m_query.resolver import DataPlatformTable logger = logging.getLogger(__name__) @@ -14,7 +14,7 @@ class AbstractDataPlatformInstanceResolver(ABC): @abstractmethod def get_platform_instance( - self, dataplatform_table: DataPlatformTable + self, data_platform_detail: PowerBIPlatformDetail ) -> PlatformDetail: pass @@ -32,10 +32,10 @@ class ResolvePlatformInstanceFromDatasetTypeMapping( BaseAbstractDataPlatformInstanceResolver ): def get_platform_instance( - self, dataplatform_table: DataPlatformTable + self, data_platform_detail: PowerBIPlatformDetail ) -> PlatformDetail: platform: Union[str, PlatformDetail] = self.config.dataset_type_mapping[ - dataplatform_table.data_platform_pair.powerbi_data_platform_name + data_platform_detail.data_platform_pair.powerbi_data_platform_name ] if isinstance(platform, PlatformDetail): @@ -48,13 +48,13 @@ class ResolvePlatformInstanceFromServerToPlatformInstance( BaseAbstractDataPlatformInstanceResolver ): def get_platform_instance( - self, dataplatform_table: DataPlatformTable + self, data_platform_detail: PowerBIPlatformDetail ) -> PlatformDetail: return ( self.config.server_to_platform_instance[ - dataplatform_table.datasource_server + data_platform_detail.data_platform_server ] - if dataplatform_table.datasource_server + if data_platform_detail.data_platform_server in self.config.server_to_platform_instance else PlatformDetail.parse_obj({}) ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py index 640bc4bd60d80..021c429c3c633 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/native_sql_parser.py @@ -1,8 +1,12 @@ import logging -from typing import List +from typing import List, Optional import sqlparse +import datahub.utilities.sqlglot_lineage as sqlglot_l +from datahub.ingestion.api.common import PipelineContext +from datahub.utilities.sqlglot_lineage import SqlParsingResult + SPECIAL_CHARACTERS = ["#(lf)", "(lf)"] logger = logging.getLogger() @@ -45,3 +49,30 @@ def 
get_tables(native_query: str) -> List[str]: from_index = from_index + 1 return tables + + +def parse_custom_sql( + ctx: PipelineContext, + query: str, + schema: Optional[str], + database: Optional[str], + platform: str, + env: str, + platform_instance: Optional[str], +) -> Optional["SqlParsingResult"]: + + logger.debug("Using sqlglot_lineage to parse custom sql") + + sql_query = remove_special_characters(query) + + logger.debug(f"Parsing sql={sql_query}") + + return sqlglot_l.create_lineage_sql_parsed_result( + query=sql_query, + schema=schema, + database=database, + platform=platform, + platform_instance=platform_instance, + env=env, + graph=ctx.graph, + ) diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py index 83106c04529d1..8cc38c366c42a 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/parser.py @@ -6,7 +6,14 @@ import lark from lark import Lark, Tree -from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.source.powerbi.config import ( + PowerBiDashboardSourceConfig, + PowerBiDashboardSourceReport, +) +from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import ( + AbstractDataPlatformInstanceResolver, +) from datahub.ingestion.source.powerbi.m_query import resolver, validator from datahub.ingestion.source.powerbi.m_query.data_classes import ( TRACE_POWERBI_MQUERY_PARSER, @@ -45,7 +52,9 @@ def _parse_expression(expression: str) -> Tree: def get_upstream_tables( table: Table, reporter: PowerBiDashboardSourceReport, - native_query_enabled: bool = True, + platform_instance_resolver: AbstractDataPlatformInstanceResolver, + ctx: PipelineContext, + config: PowerBiDashboardSourceConfig, parameters: Dict[str, str] = {}, ) -> List[resolver.DataPlatformTable]: if table.expression is None: @@ -58,7 +67,7 @@ def get_upstream_tables( parse_tree: Tree = _parse_expression(table.expression) valid, message = validator.validate_parse_tree( - parse_tree, native_query_enabled=native_query_enabled + parse_tree, native_query_enabled=config.native_query_parsing ) if valid is False: assert message is not None @@ -84,7 +93,11 @@ def get_upstream_tables( parse_tree=parse_tree, reporter=reporter, parameters=parameters, - ).resolve_to_data_platform_table_list() + ).resolve_to_data_platform_table_list( + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, + ) except BaseException as e: reporter.report_warning(table.full_name, "Failed to process m-query expression") diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py index e2b448124c89d..479f1decff903 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/m_query/resolver.py @@ -6,11 +6,19 @@ from lark import Tree +import datahub.emitter.mce_builder as builder +from datahub.ingestion.api.common import PipelineContext from datahub.ingestion.source.powerbi.config import ( DataPlatformPair, + PlatformDetail, + PowerBiDashboardSourceConfig, PowerBiDashboardSourceReport, + PowerBIPlatformDetail, SupportedDataPlatform, ) +from datahub.ingestion.source.powerbi.dataplatform_instance_resolver 
import ( + AbstractDataPlatformInstanceResolver, +) from datahub.ingestion.source.powerbi.m_query import native_sql_parser, tree_function from datahub.ingestion.source.powerbi.m_query.data_classes import ( TRACE_POWERBI_MQUERY_PARSER, @@ -19,19 +27,98 @@ IdentifierAccessor, ) from datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes import Table +from datahub.utilities.sqlglot_lineage import SqlParsingResult logger = logging.getLogger(__name__) @dataclass class DataPlatformTable: - name: str - full_name: str - datasource_server: str data_platform_pair: DataPlatformPair + urn: str + + +def urn_to_lowercase(value: str, flag: bool) -> str: + if flag is True: + return value.lower() + + return value + + +def urn_creator( + config: PowerBiDashboardSourceConfig, + platform_instance_resolver: AbstractDataPlatformInstanceResolver, + data_platform_pair: DataPlatformPair, + server: str, + qualified_table_name: str, +) -> str: + + platform_detail: PlatformDetail = platform_instance_resolver.get_platform_instance( + PowerBIPlatformDetail( + data_platform_pair=data_platform_pair, + data_platform_server=server, + ) + ) + + return builder.make_dataset_urn_with_platform_instance( + platform=data_platform_pair.datahub_data_platform_name, + platform_instance=platform_detail.platform_instance, + env=platform_detail.env, + name=urn_to_lowercase( + qualified_table_name, config.convert_lineage_urns_to_lowercase + ), + ) class AbstractDataPlatformTableCreator(ABC): + """ + Base class to share common functionalities among different dataplatform for M-Query parsing. + + To create qualified table name we need to parse M-Query data-access-functions(https://learn.microsoft.com/en-us/powerquery-m/accessing-data-functions) and + the data-access-functions has some define pattern to access database-name, schema-name and table-name, for example see below M-Query. + + let + Source = Sql.Database("localhost", "library"), + dbo_book_issue = Source{[Schema="dbo",Item="book_issue"]}[Data] + in + dbo_book_issue + + It is MSSQL M-Query and Sql.Database is the data-access-function to access MSSQL. If this function is available in M-Query then database name is available in second argument + of first statement and schema-name and table-name is available in second statement. second statement can be repeated to access different tables from MSSQL. + + DefaultTwoStepDataAccessSources extends the AbstractDataPlatformTableCreator and provides the common functionalities for data-platform which has above type of M-Query pattern + + data-access-function varies as per data-platform for example for MySQL.Database for MySQL, PostgreSQL.Database for Postgres and Oracle.Database for Oracle and number of statement to + find out database-name , schema-name and table-name also varies as per dataplatform. + + Value.NativeQuery is one of the function which is used to execute native query inside M-Query, for example see below M-Query + + let + Source = Value.NativeQuery(AmazonRedshift.Database("redshift-url","dev"), "select * from dev.public.category", null, [EnableFolding=true]) + in + Source + + In this M-Query database-name is available in first argument and rest of the detail i.e database & schema is available in native query. + + NativeQueryDataPlatformTableCreator extends AbstractDataPlatformTableCreator to support Redshift and Snowflake native query parsing. 
+ + """ + + ctx: PipelineContext + config: PowerBiDashboardSourceConfig + platform_instance_resolver: AbstractDataPlatformInstanceResolver + + def __init__( + self, + ctx: PipelineContext, + config: PowerBiDashboardSourceConfig, + platform_instance_resolver: AbstractDataPlatformInstanceResolver, + ) -> None: + super().__init__() + self.ctx = ctx + self.config = config + self.platform_instance_resolver = platform_instance_resolver + @abstractmethod def create_dataplatform_tables( self, data_access_func_detail: DataAccessFunctionDetail @@ -58,6 +145,49 @@ def get_db_detail_from_argument( return arguments[0], arguments[1] + def parse_custom_sql( + self, query: str, server: str, database: Optional[str], schema: Optional[str] + ) -> List[DataPlatformTable]: + + dataplatform_tables: List[DataPlatformTable] = [] + + platform_detail: PlatformDetail = ( + self.platform_instance_resolver.get_platform_instance( + PowerBIPlatformDetail( + data_platform_pair=self.get_platform_pair(), + data_platform_server=server, + ) + ) + ) + + parsed_result: Optional[ + "SqlParsingResult" + ] = native_sql_parser.parse_custom_sql( + ctx=self.ctx, + query=query, + platform=self.get_platform_pair().datahub_data_platform_name, + platform_instance=platform_detail.platform_instance, + env=platform_detail.env, + database=database, + schema=schema, + ) + + if parsed_result is None: + logger.debug("Failed to parse query") + return dataplatform_tables + + for urn in parsed_result.in_tables: + dataplatform_tables.append( + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ) + + logger.debug(f"Generated dataplatform_tables={dataplatform_tables}") + + return dataplatform_tables + class AbstractDataAccessMQueryResolver(ABC): table: Table @@ -80,11 +210,29 @@ def __init__( self.data_access_functions = SupportedResolver.get_function_names() @abstractmethod - def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: + def resolve_to_data_platform_table_list( + self, + ctx: PipelineContext, + config: PowerBiDashboardSourceConfig, + platform_instance_resolver: AbstractDataPlatformInstanceResolver, + ) -> List[DataPlatformTable]: pass class MQueryResolver(AbstractDataAccessMQueryResolver, ABC): + """ + This class parses the M-Query recursively to generate DataAccessFunctionDetail (see method create_data_access_functional_detail). + + This class has generic code to process M-Query tokens and create instance of DataAccessFunctionDetail. + + Once DataAccessFunctionDetail instance is initialized thereafter MQueryResolver generates the DataPlatformTable with the help of AbstractDataPlatformTableCreator + (see method resolve_to_data_platform_table_list). + + Classes which extended from AbstractDataPlatformTableCreator knows how to convert generated DataAccessFunctionDetail instance + to respective DataPlatformTable instance as per dataplatform. 
+ + """ + def get_item_selector_tokens( self, expression_tree: Tree, @@ -318,9 +466,15 @@ def internal( return table_links - def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: + def resolve_to_data_platform_table_list( + self, + ctx: PipelineContext, + config: PowerBiDashboardSourceConfig, + platform_instance_resolver: AbstractDataPlatformInstanceResolver, + ) -> List[DataPlatformTable]: data_platform_tables: List[DataPlatformTable] = [] + # Find out output variable as we are doing backtracking in M-Query output_variable: Optional[str] = tree_function.get_output_variable( self.parse_tree ) @@ -332,12 +486,14 @@ def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: ) return data_platform_tables + # Parse M-Query and use output_variable as root of tree and create instance of DataAccessFunctionDetail table_links: List[ DataAccessFunctionDetail ] = self.create_data_access_functional_detail(output_variable) # Each item is data-access function for f_detail in table_links: + # Get & Check if we support data-access-function available in M-Query supported_resolver = SupportedResolver.get_resolver( f_detail.data_access_function_name ) @@ -351,8 +507,14 @@ def resolve_to_data_platform_table_list(self) -> List[DataPlatformTable]: ) continue + # From supported_resolver enum get respective resolver like AmazonRedshift or Snowflake or Oracle or NativeQuery and create instance of it + # & also pass additional information that will be need to generate urn table_full_name_creator: AbstractDataPlatformTableCreator = ( - supported_resolver.get_table_full_name_creator()() + supported_resolver.get_table_full_name_creator()( + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, + ) ) data_platform_tables.extend( @@ -393,18 +555,24 @@ def two_level_access_pattern( IdentifierAccessor, data_access_func_detail.identifier_accessor ).items["Item"] - full_table_name: str = f"{db_name}.{schema_name}.{table_name}" + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" logger.debug( - f"Platform({self.get_platform_pair().datahub_data_platform_name}) full_table_name= {full_table_name}" + f"Platform({self.get_platform_pair().datahub_data_platform_name}) qualified_table_name= {qualified_table_name}" + ) + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, ) return [ DataPlatformTable( - name=table_name, - full_name=full_table_name, - datasource_server=server, data_platform_pair=self.get_platform_pair(), + urn=urn, ) ] @@ -420,9 +588,48 @@ def get_platform_pair(self) -> DataPlatformPair: class MSSqlDataPlatformTableCreator(DefaultTwoStepDataAccessSources): + # https://learn.microsoft.com/en-us/sql/relational-databases/security/authentication-access/ownership-and-user-schema-separation?view=sql-server-ver16 + DEFAULT_SCHEMA = "dbo" # Default schema name in MS-SQL is dbo + def get_platform_pair(self) -> DataPlatformPair: return SupportedDataPlatform.MS_SQL.value + def create_urn_using_old_parser( + self, query: str, db_name: str, server: str + ) -> List[DataPlatformTable]: + dataplatform_tables: List[DataPlatformTable] = [] + + tables: List[str] = native_sql_parser.get_tables(query) + + for table in tables: + schema_and_table: List[str] = table.split(".") + if len(schema_and_table) == 1: + # schema name is not present. 
set default schema + schema_and_table.insert(0, MSSqlDataPlatformTableCreator.DEFAULT_SCHEMA) + + qualified_table_name = ( + f"{db_name}.{schema_and_table[0]}.{schema_and_table[1]}" + ) + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + + dataplatform_tables.append( + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ) + + logger.debug(f"Generated upstream tables = {dataplatform_tables}") + + return dataplatform_tables + def create_dataplatform_tables( self, data_access_func_detail: DataAccessFunctionDetail ) -> List[DataPlatformTable]: @@ -442,28 +649,20 @@ def create_dataplatform_tables( logger.debug("Unsupported case is found. Second index is not the Query") return dataplatform_tables - db_name: str = arguments[1] - - tables: List[str] = native_sql_parser.get_tables(arguments[3]) - for table in tables: - schema_and_table: List[str] = table.split(".") - if len(schema_and_table) == 1: - # schema name is not present. Default schema name in MS-SQL is dbo - # https://learn.microsoft.com/en-us/sql/relational-databases/security/authentication-access/ownership-and-user-schema-separation?view=sql-server-ver16 - schema_and_table.insert(0, "dbo") - - dataplatform_tables.append( - DataPlatformTable( - name=schema_and_table[1], - full_name=f"{db_name}.{schema_and_table[0]}.{schema_and_table[1]}", - datasource_server=arguments[0], - data_platform_pair=self.get_platform_pair(), - ) + if self.config.enable_advance_lineage_sql_construct is False: + # Use previous parser to generate URN to keep backward compatibility + return self.create_urn_using_old_parser( + query=arguments[3], + db_name=arguments[1], + server=arguments[0], ) - logger.debug("MS-SQL full-table-names %s", dataplatform_tables) - - return dataplatform_tables + return self.parse_custom_sql( + query=arguments[3], + database=arguments[1], + server=arguments[0], + schema=MSSqlDataPlatformTableCreator.DEFAULT_SCHEMA, + ) class OracleDataPlatformTableCreator(AbstractDataPlatformTableCreator): @@ -510,12 +709,20 @@ def create_dataplatform_tables( cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next, ).items["Name"] + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + return [ DataPlatformTable( - name=table_name, - full_name=f"{db_name}.{schema_name}.{table_name}", - datasource_server=server, data_platform_pair=self.get_platform_pair(), + urn=urn, ) ] @@ -547,14 +754,28 @@ def create_dataplatform_tables( db_name: str = value_dict["Database"] schema_name: str = value_dict["Schema"] table_name: str = value_dict["Table"] + + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" + server, _ = self.get_db_detail_from_argument(data_access_func_detail.arg_list) + if server is None: + logger.info( + f"server information is not available for {qualified_table_name}. 
Skipping upstream table" + ) + return [] + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) return [ DataPlatformTable( - name=table_name, - full_name=f"{db_name}.{schema_name}.{table_name}", - datasource_server=server if server else "", data_platform_pair=self.get_platform_pair(), + urn=urn, ) ] @@ -589,20 +810,26 @@ def create_dataplatform_tables( IdentifierAccessor, data_access_func_detail.identifier_accessor.next.next # type: ignore ).items["Name"] - full_table_name: str = f"{db_name}.{schema_name}.{table_name}" + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" logger.debug( - f"{self.get_platform_pair().datahub_data_platform_name} full-table-name {full_table_name}" + f"{self.get_platform_pair().datahub_data_platform_name} qualified_table_name {qualified_table_name}" + ) + + server: str = self.get_datasource_server(arguments, data_access_func_detail) + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, ) return [ DataPlatformTable( - name=table_name, - full_name=full_table_name, - datasource_server=self.get_datasource_server( - arguments, data_access_func_detail - ), data_platform_pair=self.get_platform_pair(), + urn=urn, ) ] @@ -654,12 +881,20 @@ def create_dataplatform_tables( cast(IdentifierAccessor, data_access_func_detail.identifier_accessor).next, ).items["Name"] + qualified_table_name: str = f"{db_name}.{schema_name}.{table_name}" + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + return [ DataPlatformTable( - name=table_name, - full_name=f"{db_name}.{schema_name}.{table_name}", - datasource_server=server, data_platform_pair=self.get_platform_pair(), + urn=urn, ) ] @@ -681,6 +916,39 @@ def is_native_parsing_supported(data_access_function_name: str) -> bool: in NativeQueryDataPlatformTableCreator.SUPPORTED_NATIVE_QUERY_DATA_PLATFORM ) + def create_urn_using_old_parser( + self, query: str, server: str + ) -> List[DataPlatformTable]: + dataplatform_tables: List[DataPlatformTable] = [] + + tables: List[str] = native_sql_parser.get_tables(query) + + for qualified_table_name in tables: + if len(qualified_table_name.split(".")) != 3: + logger.debug( + f"Skipping table {qualified_table_name} as it is not as per qualified_table_name format" + ) + continue + + urn = urn_creator( + config=self.config, + platform_instance_resolver=self.platform_instance_resolver, + data_platform_pair=self.get_platform_pair(), + server=server, + qualified_table_name=qualified_table_name, + ) + + dataplatform_tables.append( + DataPlatformTable( + data_platform_pair=self.get_platform_pair(), + urn=urn, + ) + ) + + logger.debug(f"Generated dataplatform_tables {dataplatform_tables}") + + return dataplatform_tables + def create_dataplatform_tables( self, data_access_func_detail: DataAccessFunctionDetail ) -> List[DataPlatformTable]: @@ -727,25 +995,21 @@ def create_dataplatform_tables( 0 ] # Remove any whitespaces and double quotes character - for table in native_sql_parser.get_tables(sql_query): - if len(table.split(".")) != 3: - logger.debug( - f"Skipping table {table} as it is not as per 
full_table_name format" - ) - continue + server = tree_function.strip_char_from_list([data_access_tokens[2]])[0] - dataplatform_tables.append( - DataPlatformTable( - name=table.split(".")[2], - full_name=table, - datasource_server=tree_function.strip_char_from_list( - [data_access_tokens[2]] - )[0], - data_platform_pair=self.get_platform_pair(), - ) + if self.config.enable_advance_lineage_sql_construct is False: + # Use previous parser to generate URN to keep backward compatibility + return self.create_urn_using_old_parser( + query=sql_query, + server=server, ) - return dataplatform_tables + return self.parse_custom_sql( + query=sql_query, + server=server, + database=None, # database and schema is available inside custom sql as per PowerBI Behavior + schema=None, + ) class FunctionName(Enum): diff --git a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py index 919cb83e4d832..5d477ee090e7e 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py +++ b/metadata-ingestion/src/datahub/ingestion/source/powerbi/powerbi.py @@ -28,7 +28,6 @@ ) from datahub.ingestion.source.powerbi.config import ( Constant, - PlatformDetail, PowerBiDashboardSourceConfig, PowerBiDashboardSourceReport, ) @@ -96,10 +95,12 @@ def __hash__(self): def __init__( self, + ctx: PipelineContext, config: PowerBiDashboardSourceConfig, reporter: PowerBiDashboardSourceReport, dataplatform_instance_resolver: AbstractDataPlatformInstanceResolver, ): + self.__ctx = ctx self.__config = config self.__reporter = reporter self.__dataplatform_instance_resolver = dataplatform_instance_resolver @@ -172,43 +173,40 @@ def extract_lineage( # table.dataset should always be set, but we check it just in case. parameters = table.dataset.parameters if table.dataset else {} - upstreams: List[UpstreamClass] = [] - upstream_tables: List[resolver.DataPlatformTable] = parser.get_upstream_tables( - table, self.__reporter, parameters=parameters + upstream: List[UpstreamClass] = [] + + upstream_dpts: List[resolver.DataPlatformTable] = parser.get_upstream_tables( + table=table, + reporter=self.__reporter, + platform_instance_resolver=self.__dataplatform_instance_resolver, + ctx=self.__ctx, + config=self.__config, + parameters=parameters, ) + logger.debug( - f"PowerBI virtual table {table.full_name} and it's upstream dataplatform tables = {upstream_tables}" + f"PowerBI virtual table {table.full_name} and it's upstream dataplatform tables = {upstream_dpts}" ) - for upstream_table in upstream_tables: + + for upstream_dpt in upstream_dpts: if ( - upstream_table.data_platform_pair.powerbi_data_platform_name + upstream_dpt.data_platform_pair.powerbi_data_platform_name not in self.__config.dataset_type_mapping.keys() ): logger.debug( - f"Skipping upstream table for {ds_urn}. The platform {upstream_table.data_platform_pair.powerbi_data_platform_name} is not part of dataset_type_mapping", + f"Skipping upstream table for {ds_urn}. 
The platform {upstream_dpt.data_platform_pair.powerbi_data_platform_name} is not part of dataset_type_mapping", ) continue - platform_detail: PlatformDetail = ( - self.__dataplatform_instance_resolver.get_platform_instance( - upstream_table - ) - ) - upstream_urn = builder.make_dataset_urn_with_platform_instance( - platform=upstream_table.data_platform_pair.datahub_data_platform_name, - platform_instance=platform_detail.platform_instance, - env=platform_detail.env, - name=self.lineage_urn_to_lowercase(upstream_table.full_name), - ) - upstream_table_class = UpstreamClass( - upstream_urn, + upstream_dpt.urn, DatasetLineageTypeClass.TRANSFORMED, ) - upstreams.append(upstream_table_class) - if len(upstreams) > 0: - upstream_lineage = UpstreamLineageClass(upstreams=upstreams) + upstream.append(upstream_table_class) + + if len(upstream) > 0: + upstream_lineage = UpstreamLineageClass(upstreams=upstream) logger.debug(f"Dataset urn = {ds_urn} and its lineage = {upstream_lineage}") mcp = MetadataChangeProposalWrapper( entityType=Constant.DATASET, @@ -1107,7 +1105,9 @@ def __init__(self, config: PowerBiDashboardSourceConfig, ctx: PipelineContext): ) # Exit pipeline as we are not able to connect to PowerBI API Service. This exit will avoid raising # unwanted stacktrace on console - self.mapper = Mapper(config, self.reporter, self.dataplatform_instance_resolver) + self.mapper = Mapper( + ctx, config, self.reporter, self.dataplatform_instance_resolver + ) # Create and register the stateful ingestion use-case handler. self.stale_entity_removal_handler = StaleEntityRemovalHandler.create( diff --git a/metadata-ingestion/src/datahub/ingestion/source/tableau.py b/metadata-ingestion/src/datahub/ingestion/source/tableau.py index 6752bdf519830..ec0af37089b1d 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/tableau.py +++ b/metadata-ingestion/src/datahub/ingestion/source/tableau.py @@ -31,6 +31,7 @@ from tableauserverclient.server.endpoint.exceptions import NonXMLResponseError import datahub.emitter.mce_builder as builder +import datahub.utilities.sqlglot_lineage as sqlglot_l from datahub.configuration.common import ( AllowDenyPattern, ConfigModel, @@ -136,12 +137,7 @@ ViewPropertiesClass, ) from datahub.utilities import config_clean -from datahub.utilities.sqlglot_lineage import ( - ColumnLineageInfo, - SchemaResolver, - SqlParsingResult, - sqlglot_lineage, -) +from datahub.utilities.sqlglot_lineage import ColumnLineageInfo, SqlParsingResult logger: logging.Logger = logging.getLogger(__name__) @@ -1585,42 +1581,14 @@ def parse_custom_sql( f"Overridden info upstream_db={upstream_db}, platform_instance={platform_instance}, platform={platform}" ) - parsed_result: Optional["SqlParsingResult"] = None - try: - schema_resolver = ( - self.ctx.graph._make_schema_resolver( - platform=platform, - platform_instance=platform_instance, - env=env, - ) - if self.ctx.graph is not None - else SchemaResolver( - platform=platform, - platform_instance=platform_instance, - env=env, - graph=None, - ) - ) - - if schema_resolver.graph is None: - logger.warning( - "Column Level Lineage extraction would not work as DataHub graph client is None." - ) - - parsed_result = sqlglot_lineage( - query, - schema_resolver=schema_resolver, - default_db=upstream_db, - ) - except Exception as e: - self.report.report_warning( - key="csql-lineage", - reason=f"Unable to retrieve lineage from query. 
" - f"Query: {query} " - f"Reason: {str(e)} ", - ) - - return parsed_result + return sqlglot_l.create_lineage_sql_parsed_result( + query=query, + database=upstream_db, + platform=platform, + platform_instance=platform_instance, + env=env, + graph=self.ctx.graph, + ) def _create_lineage_from_unsupported_csql( self, csql_urn: str, csql: dict diff --git a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py index e5a9954802019..6d028c4ac1b9e 100644 --- a/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py +++ b/metadata-ingestion/src/datahub/utilities/sqlglot_lineage.py @@ -825,3 +825,43 @@ def sqlglot_lineage( table_error=e, ), ) + + +def create_lineage_sql_parsed_result( + query: str, + database: Optional[str], + platform: str, + platform_instance: Optional[str], + env: str, + schema: Optional[str] = None, + graph: Optional[DataHubGraph] = None, +) -> Optional["SqlParsingResult"]: + + parsed_result: Optional["SqlParsingResult"] = None + try: + schema_resolver = ( + graph._make_schema_resolver( + platform=platform, + platform_instance=platform_instance, + env=env, + ) + if graph is not None + else SchemaResolver( + platform=platform, + platform_instance=platform_instance, + env=env, + graph=None, + ) + ) + + parsed_result = sqlglot_lineage( + query, + schema_resolver=schema_resolver, + default_db=database, + default_schema=schema, + ) + except Exception as e: + logger.debug(f"Fail to prase query {query}", exc_info=e) + logger.warning("Fail to parse custom SQL") + + return parsed_result diff --git a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py index 5c9553402a8c4..e77a12aa4088e 100644 --- a/metadata-ingestion/tests/integration/powerbi/test_m_parser.py +++ b/metadata-ingestion/tests/integration/powerbi/test_m_parser.py @@ -1,17 +1,22 @@ import logging import sys -from typing import List +from typing import List, Tuple import pytest from lark import Tree import datahub.ingestion.source.powerbi.rest_api_wrapper.data_classes as powerbi_data_classes -from datahub.ingestion.source.powerbi.config import PowerBiDashboardSourceReport -from datahub.ingestion.source.powerbi.m_query import parser, tree_function -from datahub.ingestion.source.powerbi.m_query.resolver import ( - DataPlatformTable, - SupportedDataPlatform, +from datahub.ingestion.api.common import PipelineContext +from datahub.ingestion.source.powerbi.config import ( + PowerBiDashboardSourceConfig, + PowerBiDashboardSourceReport, +) +from datahub.ingestion.source.powerbi.dataplatform_instance_resolver import ( + AbstractDataPlatformInstanceResolver, + create_dataplatform_instance_resolver, ) +from datahub.ingestion.source.powerbi.m_query import parser, tree_function +from datahub.ingestion.source.powerbi.m_query.resolver import DataPlatformTable M_QUERIES = [ 'let\n Source = Snowflake.Databases("bu10758.ap-unknown-2.fakecomputing.com","PBI_TEST_WAREHOUSE_PROD",[Role="PBI_TEST_MEMBER"]),\n PBI_TEST_Database = Source{[Name="PBI_TEST",Kind="Database"]}[Data],\n TEST_Schema = PBI_TEST_Database{[Name="TEST",Kind="Schema"]}[Data],\n TESTTABLE_Table = TEST_Schema{[Name="TESTTABLE",Kind="Table"]}[Data]\nin\n TESTTABLE_Table', @@ -38,9 +43,31 @@ 'let\n Source = AmazonRedshift.Database("redshift-url","dev"),\n public = Source{[Name="public"]}[Data],\n category1 = public{[Name="category"]}[Data]\nin\n category1', 'let\n Source = Value.NativeQuery(AmazonRedshift.Database("redshift-url","dev"), 
"select * from dev.public.category", null, [EnableFolding=true]) \n in Source', 'let\n Source = Databricks.Catalogs("adb-123.azuredatabricks.net", "/sql/1.0/endpoints/12345dc91aa25844", [Catalog=null, Database=null]),\n hive_metastore_Database = Source{[Name="hive_metastore",Kind="Database"]}[Data],\n sandbox_revenue_Schema = hive_metastore_Database{[Name="sandbox_revenue",Kind="Schema"]}[Data],\n public_consumer_price_index_Table = sandbox_revenue_Schema{[Name="public_consumer_price_index",Kind="Table"]}[Data],\n #"Renamed Columns" = Table.RenameColumns(public_consumer_price_index_Table,{{"Country", "country"}, {"Metric", "metric"}}),\n #"Inserted Year" = Table.AddColumn(#"Renamed Columns", "ID", each Date.Year([date_id]) + Date.Month([date_id]), Text.Type),\n #"Added Custom" = Table.AddColumn(#"Inserted Year", "Custom", each Text.Combine({Number.ToText(Date.Year([date_id])), Number.ToText(Date.Month([date_id])), [country]})),\n #"Removed Columns" = Table.RemoveColumns(#"Added Custom",{"ID"}),\n #"Renamed Columns1" = Table.RenameColumns(#"Removed Columns",{{"Custom", "ID"}}),\n #"Filtered Rows" = Table.SelectRows(#"Renamed Columns1", each ([metric] = "Consumer Price Index") and (not Number.IsNaN([value])))\nin\n #"Filtered Rows"', + "let\n Source = Value.NativeQuery(Snowflake.Databases(\"bu10758.ap-unknown-2.fakecomputing.com\",\"operations_analytics_warehouse_prod\",[Role=\"OPERATIONS_ANALYTICS_MEMBER\"]){[Name=\"OPERATIONS_ANALYTICS\"]}[Data], \"select #(lf)UPPER(REPLACE(AGENT_NAME,'-','')) AS CLIENT_DIRECTOR,#(lf)TIER,#(lf)UPPER(MANAGER),#(lf)TEAM_TYPE,#(lf)DATE_TARGET,#(lf)MONTHID,#(lf)TARGET_TEAM,#(lf)SELLER_EMAIL,#(lf)concat((UPPER(REPLACE(AGENT_NAME,'-',''))), MONTHID) as AGENT_KEY,#(lf)UNIT_TARGET AS SME_Quota,#(lf)AMV_TARGET AS Revenue_Quota,#(lf)SERVICE_QUOTA,#(lf)BL_TARGET,#(lf)SOFTWARE_QUOTA as Software_Quota#(lf)#(lf)from OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS inner join OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT #(lf)#(lf)where YEAR_TARGET >= 2022#(lf)and TEAM_TYPE = 'Accounting'#(lf)and TARGET_TEAM = 'Enterprise'#(lf)AND TIER = 'Client Director'\", null, [EnableFolding=true])\nin\n Source", ] +def get_default_instances( + override_config: dict = {}, +) -> Tuple[ + PipelineContext, PowerBiDashboardSourceConfig, AbstractDataPlatformInstanceResolver +]: + config: PowerBiDashboardSourceConfig = PowerBiDashboardSourceConfig.parse_obj( + { + "tenant_id": "fake", + "client_id": "foo", + "client_secret": "bar", + **override_config, + } + ) + + platform_instance_resolver: AbstractDataPlatformInstanceResolver = ( + create_dataplatform_instance_resolver(config) + ) + + return PipelineContext(run_id="fake"), config, platform_instance_resolver + + @pytest.mark.integration def test_parse_m_query1(): expression: str = M_QUERIES[0] @@ -145,20 +172,20 @@ def test_snowflake_regular_case(): reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "TESTTABLE" - assert data_platform_tables[0].full_name == "PBI_TEST.TEST.TESTTABLE" assert ( - data_platform_tables[0].datasource_server - == "bu10758.ap-unknown-2.fakecomputing.com" - ) - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == 
SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,pbi_test.test.testtable,PROD)" ) @@ -174,17 +201,21 @@ def test_postgres_regular_case(): ) reporter = PowerBiDashboardSourceReport() + + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "order_date" - assert data_platform_tables[0].full_name == "mics.public.order_date" - assert data_platform_tables[0].datasource_server == "localhost" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.POSTGRES_SQL.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:postgres,mics.public.order_date,PROD)" ) @@ -200,19 +231,21 @@ def test_databricks_regular_case(): ) reporter = PowerBiDashboardSourceReport() + + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "public_consumer_price_index" assert ( - data_platform_tables[0].full_name - == "hive_metastore.sandbox_revenue.public_consumer_price_index" - ) - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.DATABRICK_SQL.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:databricks,hive_metastore.sandbox_revenue.public_consumer_price_index,PROD)" ) @@ -228,17 +261,21 @@ def test_oracle_regular_case(): ) reporter = PowerBiDashboardSourceReport() + + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "EMPLOYEES" - assert data_platform_tables[0].full_name == "salesdb.HR.EMPLOYEES" - assert data_platform_tables[0].datasource_server == "localhost:1521" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.ORACLE.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:oracle,salesdb.hr.employees,PROD)" ) @@ -255,17 +292,20 @@ def test_mssql_regular_case(): reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == "book_issue" - assert data_platform_tables[0].full_name == "library.dbo.book_issue" - assert data_platform_tables[0].datasource_server == "localhost" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == 
SupportedDataPlatform.MS_SQL.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:mssql,library.dbo.book_issue,PROD)" ) @@ -280,14 +320,16 @@ def test_mssql_with_query(): M_QUERIES[11], ] expected_tables = [ - "COMMOPSDB.dbo.V_OIP_ENT_2022", - "COMMOPSDB.dbo.V_INVOICE_BOOKING_2022", - "COMMOPSDB.dbo.V_ARR_ADDS", - "COMMOPSDB.dbo.V_PS_CD_RETENTION", - "COMMOPSDB.dbo.V_TPV_LEADERBOARD", - "COMMOPSDB.dbo.V_ENTERPRISE_INVOICED_REVENUE", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_oip_ent_2022,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_invoice_booking_2022,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_arr_adds,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_ps_cd_retention,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_tpv_leaderboard,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:mssql,commopsdb.dbo.v_enterprise_invoiced_revenue,PROD)", ] + ctx, config, platform_instance_resolver = get_default_instances() + for index, query in enumerate(mssql_queries): table: powerbi_data_classes.Table = powerbi_data_classes.Table( columns=[], @@ -299,17 +341,15 @@ def test_mssql_with_query(): reporter = PowerBiDashboardSourceReport() data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=False + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == expected_tables[index].split(".")[2] - assert data_platform_tables[0].full_name == expected_tables[index] - assert data_platform_tables[0].datasource_server == "AUPRDWHDB" - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.MS_SQL.value.powerbi_data_platform_name - ) + assert data_platform_tables[0].urn == expected_tables[index] @pytest.mark.integration @@ -322,12 +362,14 @@ def test_snowflake_native_query(): ] expected_tables = [ - "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_APS_SME_UNITS_V4", - "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS", - "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS", - "OPERATIONS_ANALYTICS.TRANSFORMED_PROD.V_SME_UNIT_TARGETS", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_aps_sme_units_v4,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_sme_unit_targets,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_sme_unit_targets,PROD)", + "urn:li:dataset:(urn:li:dataPlatform:snowflake,operations_analytics.transformed_prod.v_sme_unit_targets,PROD)", ] + ctx, config, platform_instance_resolver = get_default_instances() + for index, query in enumerate(snowflake_queries): table: powerbi_data_classes.Table = powerbi_data_classes.Table( columns=[], @@ -339,20 +381,15 @@ def test_snowflake_native_query(): reporter = PowerBiDashboardSourceReport() data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == expected_tables[index].split(".")[2] - assert data_platform_tables[0].full_name == expected_tables[index] - assert ( - data_platform_tables[0].datasource_server - == 
"bu10758.ap-unknown-2.fakecomputing.com" - ) - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name - ) + assert data_platform_tables[0].urn == expected_tables[index] def test_google_bigquery_1(): @@ -363,16 +400,20 @@ def test_google_bigquery_1(): ) reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=False + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) + assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].full_name == table.full_name - assert data_platform_tables[0].datasource_server == "seraphic-music-344307" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.GOOGLE_BIGQUERY.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:bigquery,seraphic-music-344307.school_dataset.first,PROD)" ) @@ -387,23 +428,24 @@ def test_google_bigquery_2(): ) reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( table, reporter, - native_query_enabled=False, parameters={ "Parameter - Source": "my-test-project", "My bq project": "gcp_billing", }, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].full_name == table.full_name - assert data_platform_tables[0].datasource_server == "my-test-project" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.GOOGLE_BIGQUERY.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-test-project.gcp_billing.gcp_table,PROD)" ) @@ -416,23 +458,24 @@ def test_for_each_expression_1(): reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( table, reporter, - native_query_enabled=False, parameters={ "Parameter - Source": "my-test-project", "My bq project": "gcp_billing", }, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].datasource_server == "my-test-project" - assert data_platform_tables[0].full_name == table.full_name assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.GOOGLE_BIGQUERY.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:bigquery,my-test-project.universal.d_wh_date,PROD)" ) @@ -445,22 +488,23 @@ def test_for_each_expression_2(): reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( table, reporter, - native_query_enabled=False, parameters={ "dwh-prod": 
"originally-not-a-variable-ref-and-not-resolved", }, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].full_name == table.full_name - assert data_platform_tables[0].datasource_server == "dwh-prod" assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.GOOGLE_BIGQUERY.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:bigquery,dwh-prod.gcp_billing.d_gcp_custom_label,PROD)" ) @@ -476,8 +520,14 @@ def test_native_query_disabled(): reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + config.native_query_parsing = False data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=False + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 0 @@ -493,26 +543,25 @@ def test_multi_source_table(): ) reporter = PowerBiDashboardSourceReport() + + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=False + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 2 - assert data_platform_tables[0].full_name == "mics.public.order_date" - assert data_platform_tables[0].datasource_server == "localhost" - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.POSTGRES_SQL.value.powerbi_data_platform_name - ) - - assert data_platform_tables[1].full_name == "GSL_TEST_DB.PUBLIC.SALES_ANALYST_VIEW" assert ( - data_platform_tables[1].datasource_server - == "ghh48144.snowflakefakecomputing.com" + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:postgres,mics.public.order_date,PROD)" ) assert ( - data_platform_tables[1].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name + data_platform_tables[1].urn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,gsl_test_db.public.sales_analyst_view,PROD)" ) @@ -521,36 +570,33 @@ def test_table_combine(): table: powerbi_data_classes.Table = powerbi_data_classes.Table( columns=[], measures=[], - expression=M_QUERIES[16], # 1st index has the native query + expression=M_QUERIES[16], name="virtual_order_table", full_name="OrderDataSet.virtual_order_table", ) reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 2 - assert data_platform_tables[0].full_name == "GSL_TEST_DB.PUBLIC.SALES_FORECAST" - assert ( - data_platform_tables[0].datasource_server - == "ghh48144.snowflakefakecomputing.com" - ) - assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name - ) - assert data_platform_tables[1].full_name == "GSL_TEST_DB.PUBLIC.SALES_ANALYST" assert ( - 
data_platform_tables[1].datasource_server - == "ghh48144.snowflakefakecomputing.com" + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,gsl_test_db.public.sales_forecast,PROD)" ) + assert ( - data_platform_tables[1].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.SNOWFLAKE.value.powerbi_data_platform_name + data_platform_tables[1].urn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,gsl_test_db.public.sales_analyst,PROD)" ) @@ -574,8 +620,14 @@ def test_expression_is_none(): reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) assert len(data_platform_tables) == 0 @@ -589,15 +641,20 @@ def test_redshift_regular_case(): ) reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=False + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) + assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].full_name == table.full_name assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.AMAZON_REDSHIFT.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.category,PROD)" ) @@ -609,13 +666,60 @@ def test_redshift_native_query(): ) reporter = PowerBiDashboardSourceReport() + ctx, config, platform_instance_resolver = get_default_instances() + + config.native_query_parsing = True + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( - table, reporter, native_query_enabled=True + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, ) + assert len(data_platform_tables) == 1 - assert data_platform_tables[0].name == table.full_name.split(".")[2] - assert data_platform_tables[0].full_name == table.full_name assert ( - data_platform_tables[0].data_platform_pair.powerbi_data_platform_name - == SupportedDataPlatform.AMAZON_REDSHIFT.value.powerbi_data_platform_name + data_platform_tables[0].urn + == "urn:li:dataset:(urn:li:dataPlatform:redshift,dev.public.category,PROD)" + ) + + +def test_sqlglot_parser(): + table: powerbi_data_classes.Table = powerbi_data_classes.Table( + expression=M_QUERIES[24], + name="SALES_TARGET", + full_name="dev.public.sales", + ) + reporter = PowerBiDashboardSourceReport() + + ctx, config, platform_instance_resolver = get_default_instances( + override_config={ + "server_to_platform_instance": { + "bu10758.ap-unknown-2.fakecomputing.com": { + "platform_instance": "sales_deployment", + "env": "PROD", + } + }, + "native_query_parsing": True, + "enable_advance_lineage_sql_construct": True, + } + ) + + data_platform_tables: List[DataPlatformTable] = parser.get_upstream_tables( + table, + reporter, + ctx=ctx, + config=config, + platform_instance_resolver=platform_instance_resolver, + ) + + assert len(data_platform_tables) == 2 + assert ( + data_platform_tables[0].urn + == 
"urn:li:dataset:(urn:li:dataPlatform:snowflake,sales_deployment.operations_analytics.transformed_prod.v_sme_unit,PROD)" + ) + assert ( + data_platform_tables[1].urn + == "urn:li:dataset:(urn:li:dataPlatform:snowflake,sales_deployment.operations_analytics.transformed_prod.v_sme_unit_targets,PROD)" ) diff --git a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py index d04c8d905b439..71428a7847953 100644 --- a/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py +++ b/metadata-ingestion/tests/integration/tableau/test_tableau_ingest.py @@ -791,11 +791,9 @@ def test_tableau_unsupported_csql(mock_datahub_graph): database_override_map={"production database": "prod"} ) - with mock.patch( - "datahub.ingestion.source.tableau.sqlglot_lineage" - ) as sqlglot_lineage: + with mock.patch("datahub.ingestion.source.tableau.sqlglot_l") as sqlglot_lineage: - sqlglot_lineage.return_value = SqlParsingResult( # type:ignore + sqlglot_lineage.create_lineage_sql_parsed_result.return_value = SqlParsingResult( # type:ignore in_tables=[ "urn:li:dataset:(urn:li:dataPlatform:bigquery,my_bigquery_project.invent_dw.userdetail,PROD)" ], From 68abf9c6a1f0ccb9ad144247805781587c40ceeb Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Wed, 23 Aug 2023 07:25:51 -0400 Subject: [PATCH 09/11] build(ingest): Bump pydantic pin (#8660) --- metadata-ingestion/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py index 4ff1d06bb8c22..62cb4f1abb8cf 100644 --- a/metadata-ingestion/setup.py +++ b/metadata-ingestion/setup.py @@ -454,7 +454,7 @@ def get_long_description(): "mypy==1.0.0", # pydantic 1.8.2 is incompatible with mypy 0.910. # See https://github.com/samuelcolvin/pydantic/pull/3175#issuecomment-995382910. 
- "pydantic>=1.9.0", + "pydantic>=1.10.0", *test_api_requirements, pytest_dep, "pytest-asyncio>=0.16.0", From 8141e2d64920f0511c531c493a3b61b5dc2ca026 Mon Sep 17 00:00:00 2001 From: Andrew Sikowitz Date: Wed, 23 Aug 2023 15:57:46 -0400 Subject: [PATCH 10/11] remove(ingest/snowflake): Remove legacy snowflake lineage (#8653) Co-authored-by: Tamas Nemeth Co-authored-by: Aseem Bansal --- .../source/snowflake/snowflake_config.py | 11 +- .../snowflake/snowflake_lineage_legacy.py | 664 ------------------ .../source/snowflake/snowflake_query.py | 29 - .../source/snowflake/snowflake_v2.py | 18 +- .../tests/integration/snowflake/common.py | 9 - .../integration/snowflake/test_snowflake.py | 2 - .../snowflake/test_snowflake_failures.py | 1 - .../test_snowflake_failures_legacy_lineage.py | 291 -------- .../test_snowflake_legacy_lineage.py | 207 ------ 9 files changed, 6 insertions(+), 1226 deletions(-) delete mode 100644 metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_legacy.py delete mode 100644 metadata-ingestion/tests/integration/snowflake/test_snowflake_failures_legacy_lineage.py delete mode 100644 metadata-ingestion/tests/integration/snowflake/test_snowflake_legacy_lineage.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index e8e80e172a9ce..7699d89ce9ac2 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -91,13 +91,8 @@ class SnowflakeV2Config( description="Whether `schema_pattern` is matched against fully qualified schema name `.`.", ) - use_legacy_lineage_method: bool = Field( - default=False, - description=( - "Whether to use the legacy lineage computation method. " - "By default, uses new optimised lineage extraction method that requires less ingestion process memory. " - "Table-to-view and view-to-view column-level lineage are not supported with the legacy method." - ), + _use_legacy_lineage_method_removed = pydantic_removed_field( + "use_legacy_lineage_method" ) validate_upstreams_against_patterns: bool = Field( @@ -113,7 +108,7 @@ class SnowflakeV2Config( # This is required since access_history table does not capture whether the table was temporary table. temporary_tables_pattern: List[str] = Field( default=DEFAULT_TABLES_DENY_LIST, - description="[Advanced] Regex patterns for temporary tables to filter in lineage ingestion. Specify regex to match the entire table name in database.schema.table format. Defaults are to set in such a way to ignore the temporary staging tables created by known ETL tools. Not used if `use_legacy_lineage_method=True`", + description="[Advanced] Regex patterns for temporary tables to filter in lineage ingestion. Specify regex to match the entire table name in database.schema.table format. 
Defaults are to set in such a way to ignore the temporary staging tables created by known ETL tools.", ) rename_upstreams_deny_pattern_to_temporary_table_pattern = pydantic_renamed_field( diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_legacy.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_legacy.py deleted file mode 100644 index 832a072c619f8..0000000000000 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_lineage_legacy.py +++ /dev/null @@ -1,664 +0,0 @@ -import json -import logging -from collections import defaultdict -from dataclasses import dataclass, field -from typing import Any, Callable, Dict, FrozenSet, Iterable, List, Optional, Set - -from pydantic import Field -from pydantic.error_wrappers import ValidationError -from snowflake.connector import SnowflakeConnection - -import datahub.emitter.mce_builder as builder -from datahub.emitter.mcp import MetadataChangeProposalWrapper -from datahub.ingestion.api.workunit import MetadataWorkUnit -from datahub.ingestion.source.aws.s3_util import make_s3_urn -from datahub.ingestion.source.snowflake.constants import ( - LINEAGE_PERMISSION_ERROR, - SnowflakeEdition, - SnowflakeObjectDomain, -) -from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config -from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery -from datahub.ingestion.source.snowflake.snowflake_report import SnowflakeV2Report -from datahub.ingestion.source.snowflake.snowflake_usage_v2 import ( - SnowflakeColumnReference, -) -from datahub.ingestion.source.snowflake.snowflake_utils import ( - SnowflakeCommonMixin, - SnowflakeConnectionMixin, - SnowflakePermissionError, - SnowflakeQueryMixin, -) -from datahub.metadata.com.linkedin.pegasus2avro.dataset import ( - FineGrainedLineage, - FineGrainedLineageDownstreamType, - FineGrainedLineageUpstreamType, - UpstreamLineage, -) -from datahub.metadata.schema_classes import DatasetLineageTypeClass, UpstreamClass -from datahub.utilities.perf_timer import PerfTimer - -logger: logging.Logger = logging.getLogger(__name__) - - -class SnowflakeColumnWithLineage(SnowflakeColumnReference): - class Config: - # This is for backward compatibility and can be removed later - allow_population_by_field_name = True - - directSourceColumns: Optional[List[SnowflakeColumnReference]] = Field( - default=None, alias="directSources" - ) - - -@dataclass(frozen=True) -class SnowflakeColumnId: - columnName: str - objectName: str - objectDomain: Optional[str] = None - - -@dataclass(frozen=True) -class SnowflakeColumnFineGrainedLineage: - """ - Fie grained upstream of column, - which represents a transformation applied on input columns""" - - inputColumns: FrozenSet[SnowflakeColumnId] - # Transform function, query etc can be added here - - -@dataclass -class SnowflakeColumnUpstreams: - """All upstreams of a column""" - - upstreams: Set[SnowflakeColumnFineGrainedLineage] = field( - default_factory=set, init=False - ) - - def update_column_lineage( - self, directSourceColumns: List[SnowflakeColumnReference] - ) -> None: - input_columns = frozenset( - [ - SnowflakeColumnId( - upstream_col.columnName, - upstream_col.objectName, - upstream_col.objectDomain, - ) - for upstream_col in directSourceColumns - if upstream_col.objectName - ] - ) - if not input_columns: - return - upstream = SnowflakeColumnFineGrainedLineage(inputColumns=input_columns) - if upstream not in self.upstreams: - self.upstreams.add(upstream) - - -@dataclass 
-class SnowflakeUpstreamTable: - upstreamDataset: str - upstreamColumns: List[SnowflakeColumnReference] - downstreamColumns: List[SnowflakeColumnWithLineage] - - @classmethod - def from_dict( - cls, - dataset: str, - upstreams_columns_json: Optional[str], - downstream_columns_json: Optional[str], - ) -> "SnowflakeUpstreamTable": - try: - upstreams_columns_list = [] - downstream_columns_list = [] - if upstreams_columns_json is not None: - upstreams_columns_list = json.loads(upstreams_columns_json) - if downstream_columns_json is not None: - downstream_columns_list = json.loads(downstream_columns_json) - - table_with_upstreams = cls( - dataset, - [ - SnowflakeColumnReference.parse_obj(col) - for col in upstreams_columns_list - ], - [ - SnowflakeColumnWithLineage.parse_obj(col) - for col in downstream_columns_list - ], - ) - except ValidationError: - # Earlier versions of column lineage did not include columnName, only columnId - table_with_upstreams = cls(dataset, [], []) - return table_with_upstreams - - -@dataclass -class SnowflakeTableLineage: - # key: upstream table name - upstreamTables: Dict[str, SnowflakeUpstreamTable] = field( - default_factory=dict, init=False - ) - - # key: downstream column name - columnLineages: Dict[str, SnowflakeColumnUpstreams] = field( - default_factory=lambda: defaultdict(SnowflakeColumnUpstreams), init=False - ) - - def update_lineage( - self, table: SnowflakeUpstreamTable, include_column_lineage: bool = True - ) -> None: - if table.upstreamDataset not in self.upstreamTables.keys(): - self.upstreamTables[table.upstreamDataset] = table - - if include_column_lineage and table.downstreamColumns: - for col in table.downstreamColumns: - if col.directSourceColumns: - self.columnLineages[col.columnName].update_column_lineage( - col.directSourceColumns - ) - - -class SnowflakeLineageExtractor( - SnowflakeQueryMixin, SnowflakeConnectionMixin, SnowflakeCommonMixin -): - """ - Extracts Lineage from Snowflake. - Following lineage edges are considered. - - 1. "Table to View" lineage via `snowflake.account_usage.object_dependencies` view - 2. "S3 to Table" lineage via `show external tables` query. - 3. "View to Table" lineage via `snowflake.account_usage.access_history` view (requires Snowflake Enterprise Edition or above) - 4. "Table to Table" lineage via `snowflake.account_usage.access_history` view (requires Snowflake Enterprise Edition or above) - 5. "S3 to Table" lineage via `snowflake.account_usage.access_history` view (requires Snowflake Enterprise Edition or above) - - Edition Note - Snowflake Standard Edition does not have Access History Feature. So it does not support lineage extraction for edges 3, 4, 5 mentioned above. 
- """ - - def __init__( - self, - config: SnowflakeV2Config, - report: SnowflakeV2Report, - dataset_urn_builder: Callable[[str], str], - ) -> None: - self._lineage_map: Dict[str, SnowflakeTableLineage] = defaultdict( - SnowflakeTableLineage - ) - self._external_lineage_map: Dict[str, Set[str]] = defaultdict(set) - self.config = config - self.report = report - self.logger = logger - self.dataset_urn_builder = dataset_urn_builder - self.connection: Optional[SnowflakeConnection] = None - - # Kwargs used by new snowflake lineage extractor need to be ignored here - def get_workunits( - self, discovered_tables: List[str], discovered_views: List[str], **_kwargs: Any - ) -> Iterable[MetadataWorkUnit]: - self.connection = self.create_connection() - if self.connection is None: - return - - self._populate_table_lineage() - - if self.config.include_view_lineage: - if len(discovered_views) > 0: - self._populate_view_lineage() - else: - logger.info("No views found. Skipping View Lineage Extraction.") - - self._populate_external_lineage() - - if ( - len(self._lineage_map.keys()) == 0 - and len(self._external_lineage_map.keys()) == 0 - ): - logger.debug("No lineage found.") - return - - yield from self.get_table_upstream_workunits(discovered_tables) - yield from self.get_view_upstream_workunits(discovered_views) - - def _populate_table_lineage(self): - if self.report.edition == SnowflakeEdition.STANDARD: - logger.info( - "Snowflake Account is Standard Edition. Table to Table Lineage Feature is not supported." - ) # See Edition Note above for why - else: - with PerfTimer() as timer: - self._populate_lineage() - self.report.table_lineage_query_secs = timer.elapsed_seconds() - - def get_table_upstream_workunits(self, discovered_tables): - if self.config.include_table_lineage: - for dataset_name in discovered_tables: - upstream_lineage = self._get_upstream_lineage_info(dataset_name) - if upstream_lineage is not None: - yield MetadataChangeProposalWrapper( - entityUrn=self.dataset_urn_builder(dataset_name), - aspect=upstream_lineage, - ).as_workunit() - - def get_view_upstream_workunits(self, discovered_views): - if self.config.include_view_lineage: - for view_name in discovered_views: - upstream_lineage = self._get_upstream_lineage_info(view_name) - if upstream_lineage is not None: - yield MetadataChangeProposalWrapper( - entityUrn=self.dataset_urn_builder(view_name), - aspect=upstream_lineage, - ).as_workunit() - - def _get_upstream_lineage_info( - self, dataset_name: str - ) -> Optional[UpstreamLineage]: - lineage = self._lineage_map[dataset_name] - external_lineage = self._external_lineage_map[dataset_name] - if not (lineage.upstreamTables or lineage.columnLineages or external_lineage): - logger.debug(f"No lineage found for {dataset_name}") - return None - - upstream_tables: List[UpstreamClass] = [] - finegrained_lineages: List[FineGrainedLineage] = [] - - # Populate the table-lineage in aspect - self.update_upstream_tables_lineage(upstream_tables, lineage) - - # Populate the column-lineage in aspect - self.update_upstream_columns_lineage( - self.dataset_urn_builder(dataset_name), finegrained_lineages, lineage - ) - - # Populate the external-table-lineage(s3->snowflake) in aspect - self.update_external_tables_lineage(upstream_tables, external_lineage) - - if len(upstream_tables) > 0: - logger.debug( - f"Upstream lineage of '{dataset_name}': {[u.dataset for u in upstream_tables]}" - ) - if self.config.upstream_lineage_in_report: - self.report.upstream_lineage[dataset_name] = [ - u.dataset for u in 
upstream_tables - ] - return UpstreamLineage( - upstreams=upstream_tables, - fineGrainedLineages=sorted( - finegrained_lineages, key=lambda x: (x.downstreams, x.upstreams) - ) - or None, - ) - else: - return None - - def _populate_view_lineage(self) -> None: - with PerfTimer() as timer: - self._populate_view_upstream_lineage() - self.report.view_upstream_lineage_query_secs = timer.elapsed_seconds() - - if self.report.edition == SnowflakeEdition.STANDARD: - logger.info( - "Snowflake Account is Standard Edition. View to Table Lineage Feature is not supported." - ) # See Edition Note above for why - else: - with PerfTimer() as timer: - self._populate_view_downstream_lineage() - self.report.view_downstream_lineage_query_secs = timer.elapsed_seconds() - - def _populate_external_lineage(self) -> None: - with PerfTimer() as timer: - self.report.num_external_table_edges_scanned = 0 - - if self.report.edition == SnowflakeEdition.STANDARD: - logger.info( - "Snowflake Account is Standard Edition. External Lineage Feature via Access History is not supported." - ) # See Edition Note above for why - else: - self._populate_external_lineage_from_access_history() - - self._populate_external_lineage_from_show_query() - - logger.info( - f"Found {self.report.num_external_table_edges_scanned} external lineage edges." - ) - - self.report.external_lineage_queries_secs = timer.elapsed_seconds() - - # Handles the case for explicitly created external tables. - # NOTE: Snowflake does not log this information to the access_history table. - def _populate_external_lineage_from_show_query(self): - external_tables_query: str = SnowflakeQuery.show_external_tables() - try: - for db_row in self.query(external_tables_query): - key = self.get_dataset_identifier( - db_row["name"], db_row["schema_name"], db_row["database_name"] - ) - - if not self._is_dataset_pattern_allowed( - key, SnowflakeObjectDomain.TABLE - ): - continue - self._external_lineage_map[key].add(db_row["location"]) - logger.debug( - f"ExternalLineage[Table(Down)={key}]:External(Up)={self._external_lineage_map[key]} via show external tables" - ) - self.report.num_external_table_edges_scanned += 1 - except Exception as e: - logger.debug(e, exc_info=e) - self.report_warning( - "external_lineage", - f"Populating external table lineage from Snowflake failed due to error {e}.", - ) - - # Handles the case where a table is populated from an external location via copy. - # Eg: copy into category_english from 's3://acryl-snow-demo-olist/olist_raw_data/category_english'credentials=(aws_key_id='...' aws_secret_key='...') pattern='.*.csv'; - def _populate_external_lineage_from_access_history(self): - query: str = SnowflakeQuery.external_table_lineage_history( - start_time_millis=int(self.config.start_time.timestamp() * 1000) - if not self.config.ignore_start_time_lineage - else 0, - end_time_millis=int(self.config.end_time.timestamp() * 1000), - ) - - try: - for db_row in self.query(query): - self._process_external_lineage_result_row(db_row) - except Exception as e: - if isinstance(e, SnowflakePermissionError): - error_msg = "Failed to get external lineage. Please grant imported privileges on SNOWFLAKE database. 
" - self.warn_if_stateful_else_error(LINEAGE_PERMISSION_ERROR, error_msg) - else: - logger.debug(e, exc_info=e) - self.report_warning( - "external_lineage", - f"Populating table external lineage from Snowflake failed due to error {e}.", - ) - - def _process_external_lineage_result_row(self, db_row): - # key is the down-stream table name - key: str = self.get_dataset_identifier_from_qualified_name( - db_row["DOWNSTREAM_TABLE_NAME"] - ) - if not self._is_dataset_pattern_allowed(key, SnowflakeObjectDomain.TABLE): - return - - if db_row["UPSTREAM_LOCATIONS"] is not None: - external_locations = json.loads(db_row["UPSTREAM_LOCATIONS"]) - - for loc in external_locations: - if loc not in self._external_lineage_map[key]: - self._external_lineage_map[key].add(loc) - self.report.num_external_table_edges_scanned += 1 - - logger.debug( - f"ExternalLineage[Table(Down)={key}]:External(Up)={self._external_lineage_map[key]} via access_history" - ) - - def _populate_lineage(self) -> None: - query: str = SnowflakeQuery.table_to_table_lineage_history( - start_time_millis=int(self.config.start_time.timestamp() * 1000) - if not self.config.ignore_start_time_lineage - else 0, - end_time_millis=int(self.config.end_time.timestamp() * 1000), - include_column_lineage=self.config.include_column_lineage, - ) - self.report.num_table_to_table_edges_scanned = 0 - try: - for db_row in self.query(query): - self._process_table_lineage_row(db_row) - except Exception as e: - if isinstance(e, SnowflakePermissionError): - error_msg = "Failed to get table to table lineage. Please grant imported privileges on SNOWFLAKE database. " - self.warn_if_stateful_else_error(LINEAGE_PERMISSION_ERROR, error_msg) - else: - logger.debug(e, exc_info=e) - self.report_warning( - "table-lineage", - f"Extracting lineage from Snowflake failed due to error {e}.", - ) - logger.info( - f"A total of {self.report.num_table_to_table_edges_scanned} Table->Table edges found" - f" for {len(self._lineage_map)} downstream tables.", - ) - - def _process_table_lineage_row(self, db_row): - # key is the down-stream table name - key: str = self.get_dataset_identifier_from_qualified_name( - db_row["DOWNSTREAM_TABLE_NAME"] - ) - upstream_table_name = self.get_dataset_identifier_from_qualified_name( - db_row["UPSTREAM_TABLE_NAME"] - ) - if not self._is_dataset_pattern_allowed( - key, SnowflakeObjectDomain.TABLE - ) or not ( - self._is_dataset_pattern_allowed( - upstream_table_name, SnowflakeObjectDomain.TABLE, is_upstream=True - ) - ): - return - self._lineage_map[key].update_lineage( - # (, , ) - SnowflakeUpstreamTable.from_dict( - upstream_table_name, - db_row["UPSTREAM_TABLE_COLUMNS"], - db_row["DOWNSTREAM_TABLE_COLUMNS"], - ), - self.config.include_column_lineage, - ) - self.report.num_table_to_table_edges_scanned += 1 - logger.debug(f"Lineage[Table(Down)={key}]:Table(Up)={self._lineage_map[key]}") - - def _populate_view_upstream_lineage(self) -> None: - # NOTE: This query captures only the upstream lineage of a view (with no column lineage). - # For more details see: https://docs.snowflake.com/en/user-guide/object-dependencies.html#object-dependencies - # and also https://docs.snowflake.com/en/sql-reference/account-usage/access_history.html#usage-notes for current limitations on capturing the lineage for views. 
- view_upstream_lineage_query: str = SnowflakeQuery.view_dependencies() - - self.report.num_table_to_view_edges_scanned = 0 - - try: - for db_row in self.query(view_upstream_lineage_query): - self._process_view_upstream_lineage_row(db_row) - except Exception as e: - if isinstance(e, SnowflakePermissionError): - error_msg = "Failed to get table to view lineage. Please grant imported privileges on SNOWFLAKE database." - self.warn_if_stateful_else_error(LINEAGE_PERMISSION_ERROR, error_msg) - else: - logger.debug(e, exc_info=e) - self.report_warning( - "view-upstream-lineage", - f"Extracting the upstream view lineage from Snowflake failed due to error {e}.", - ) - logger.info( - f"A total of {self.report.num_table_to_view_edges_scanned} View upstream edges found." - ) - - def _process_view_upstream_lineage_row(self, db_row): - # Process UpstreamTable/View/ExternalTable/Materialized View->View edge. - view_upstream: str = self.get_dataset_identifier_from_qualified_name( - db_row["VIEW_UPSTREAM"] - ) - view_name: str = self.get_dataset_identifier_from_qualified_name( - db_row["DOWNSTREAM_VIEW"] - ) - - if not self._is_dataset_pattern_allowed( - dataset_name=view_name, - dataset_type=db_row["REFERENCING_OBJECT_DOMAIN"], - ) or not self._is_dataset_pattern_allowed( - view_upstream, db_row["REFERENCED_OBJECT_DOMAIN"], is_upstream=True - ): - return - # key is the downstream view name - self._lineage_map[view_name].update_lineage( - # (, , ) - SnowflakeUpstreamTable.from_dict(view_upstream, None, None), - self.config.include_column_lineage, - ) - self.report.num_table_to_view_edges_scanned += 1 - logger.debug( - f"Upstream->View: Lineage[View(Down)={view_name}]:Upstream={view_upstream}" - ) - - def _populate_view_downstream_lineage(self) -> None: - # This query captures the downstream table lineage for views. - # See https://docs.snowflake.com/en/sql-reference/account-usage/access_history.html#usage-notes for current limitations on capturing the lineage for views. - # Eg: For viewA->viewB->ViewC->TableD, snowflake does not yet log intermediate view logs, resulting in only the viewA->TableD edge. - view_lineage_query: str = SnowflakeQuery.view_lineage_history( - start_time_millis=int(self.config.start_time.timestamp() * 1000) - if not self.config.ignore_start_time_lineage - else 0, - end_time_millis=int(self.config.end_time.timestamp() * 1000), - include_column_lineage=self.config.include_column_lineage, - ) - - self.report.num_view_to_table_edges_scanned = 0 - - try: - for db_row in self.query(view_lineage_query): - self._process_view_downstream_lineage_row(db_row) - except Exception as e: - if isinstance(e, SnowflakePermissionError): - error_msg = "Failed to get view to table lineage. Please grant imported privileges on SNOWFLAKE database. " - self.warn_if_stateful_else_error(LINEAGE_PERMISSION_ERROR, error_msg) - else: - logger.debug(e, exc_info=e) - self.report_warning( - "view-downstream-lineage", - f"Extracting the view lineage from Snowflake failed due to error {e}.", - ) - - logger.info( - f"Found {self.report.num_view_to_table_edges_scanned} View->Table edges." 
- ) - - def _process_view_downstream_lineage_row(self, db_row): - view_name: str = self.get_dataset_identifier_from_qualified_name( - db_row["VIEW_NAME"] - ) - downstream_table: str = self.get_dataset_identifier_from_qualified_name( - db_row["DOWNSTREAM_TABLE_NAME"] - ) - if not self._is_dataset_pattern_allowed( - view_name, db_row["VIEW_DOMAIN"], is_upstream=True - ) or not self._is_dataset_pattern_allowed( - downstream_table, db_row["DOWNSTREAM_TABLE_DOMAIN"] - ): - return - - # Capture view->downstream table lineage. - self._lineage_map[downstream_table].update_lineage( - # (, , ) - SnowflakeUpstreamTable.from_dict( - view_name, - db_row["VIEW_COLUMNS"], - db_row["DOWNSTREAM_TABLE_COLUMNS"], - ), - self.config.include_column_lineage, - ) - self.report.num_view_to_table_edges_scanned += 1 - - logger.debug( - f"View->Table: Lineage[Table(Down)={downstream_table}]:View(Up)={self._lineage_map[downstream_table]}" - ) - - def update_upstream_tables_lineage( - self, upstream_tables: List[UpstreamClass], lineage: SnowflakeTableLineage - ) -> None: - for lineage_entry in sorted( - lineage.upstreamTables.values(), key=lambda x: x.upstreamDataset - ): - upstream_table_name = lineage_entry.upstreamDataset - upstream_table = UpstreamClass( - dataset=self.dataset_urn_builder(upstream_table_name), - type=DatasetLineageTypeClass.TRANSFORMED, - ) - upstream_tables.append(upstream_table) - - def update_upstream_columns_lineage( - self, - dataset_urn: str, - finegrained_lineages: List[FineGrainedLineage], - lineage: SnowflakeTableLineage, - ) -> None: - # For every column for which upstream lineage is available - for col, col_upstreams in lineage.columnLineages.items(): - # For every upstream of column - self.update_upstream_columns_lineage_of_column( - dataset_urn, col, finegrained_lineages, col_upstreams - ) - - def update_upstream_columns_lineage_of_column( - self, - dataset_urn: str, - col: str, - finegrained_lineages: List[FineGrainedLineage], - col_upstreams: SnowflakeColumnUpstreams, - ) -> None: - for fine_upstream in col_upstreams.upstreams: - finegrained_lineage_entry = self.build_finegrained_lineage( - dataset_urn, col, fine_upstream - ) - if finegrained_lineage_entry.upstreams: - finegrained_lineages.append(finegrained_lineage_entry) - - def build_finegrained_lineage( - self, - dataset_urn: str, - col: str, - fine_upstream: SnowflakeColumnFineGrainedLineage, - ) -> FineGrainedLineage: - fieldPath = col - - column_upstreams = self.build_finegrained_lineage_upstreams(fine_upstream) - finegrained_lineage_entry = FineGrainedLineage( - upstreamType=FineGrainedLineageUpstreamType.FIELD_SET, - # Sorting the list of upstream lineage events in order to avoid creating multiple aspects in backend - # even if the lineage is same but the order is different. 
- upstreams=sorted(column_upstreams), - downstreamType=FineGrainedLineageDownstreamType.FIELD, - downstreams=[ - builder.make_schema_field_urn( - dataset_urn, self.snowflake_identifier(fieldPath) - ) - ], - ) - - return finegrained_lineage_entry - - def build_finegrained_lineage_upstreams( - self, fine_upstream: SnowflakeColumnFineGrainedLineage - ) -> List[str]: - column_upstreams = [] - for upstream_col in fine_upstream.inputColumns: - if ( - upstream_col.objectName - and upstream_col.columnName - and self._is_dataset_pattern_allowed( - upstream_col.objectName, upstream_col.objectDomain, is_upstream=True - ) - ): - upstream_dataset_name = self.get_dataset_identifier_from_qualified_name( - upstream_col.objectName - ) - column_upstreams.append( - builder.make_schema_field_urn( - self.dataset_urn_builder(upstream_dataset_name), - self.snowflake_identifier(upstream_col.columnName), - ) - ) - return column_upstreams - - def update_external_tables_lineage( - self, upstream_tables: List[UpstreamClass], external_lineage: Set[str] - ) -> None: - for external_lineage_entry in sorted(external_lineage): - # For now, populate only for S3 - if external_lineage_entry.startswith("s3://"): - external_upstream_table = UpstreamClass( - dataset=make_s3_urn(external_lineage_entry, self.config.env), - type=DatasetLineageTypeClass.COPY, - ) - upstream_tables.append(external_upstream_table) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py index 587c71a98be67..0f89324f5efc6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_query.py @@ -506,35 +506,6 @@ def view_dependencies_v2() -> str: def show_external_tables() -> str: return "show external tables in account" - # Note - This method should be removed once legacy lineage is removed - @staticmethod - def external_table_lineage_history( - start_time_millis: int, end_time_millis: int - ) -> str: - return f""" - WITH external_table_lineage_history AS ( - SELECT - r.value:"locations" AS upstream_locations, - w.value:"objectName"::varchar AS downstream_table_name, - w.value:"objectDomain"::varchar AS downstream_table_domain, - w.value:"columns" AS downstream_table_columns, - t.query_start_time AS query_start_time - FROM - (SELECT * from snowflake.account_usage.access_history) t, - lateral flatten(input => t.BASE_OBJECTS_ACCESSED) r, - lateral flatten(input => t.OBJECTS_MODIFIED) w - WHERE r.value:"locations" IS NOT NULL - AND w.value:"objectId" IS NOT NULL - AND t.query_start_time >= to_timestamp_ltz({start_time_millis}, 3) - AND t.query_start_time < to_timestamp_ltz({end_time_millis}, 3)) - SELECT - upstream_locations AS "UPSTREAM_LOCATIONS", - downstream_table_name AS "DOWNSTREAM_TABLE_NAME", - downstream_table_columns AS "DOWNSTREAM_TABLE_COLUMNS" - FROM external_table_lineage_history - WHERE downstream_table_domain = '{SnowflakeObjectDomain.TABLE.capitalize()}' - QUALIFY ROW_NUMBER() OVER (PARTITION BY downstream_table_name ORDER BY query_start_time DESC) = 1""" - @staticmethod def copy_lineage_history( start_time_millis: int, diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py index 7dd51d5b20e8e..40c4d32525a51 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py +++ 
b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_v2.py @@ -51,9 +51,6 @@ SnowflakeV2Config, TagOption, ) -from datahub.ingestion.source.snowflake.snowflake_lineage_legacy import ( - SnowflakeLineageExtractor as SnowflakeLineageLegacyExtractor, -) from datahub.ingestion.source.snowflake.snowflake_lineage_v2 import ( SnowflakeLineageExtractor, ) @@ -240,19 +237,10 @@ def __init__(self, ctx: PipelineContext, config: SnowflakeV2Config): # For database, schema, tables, views, etc self.data_dictionary = SnowflakeDataDictionary() - self.lineage_extractor: Union[ - SnowflakeLineageExtractor, SnowflakeLineageLegacyExtractor - ] if config.include_table_lineage: - # For lineage - if self.config.use_legacy_lineage_method: - self.lineage_extractor = SnowflakeLineageLegacyExtractor( - config, self.report, dataset_urn_builder=self.gen_dataset_urn - ) - else: - self.lineage_extractor = SnowflakeLineageExtractor( - config, self.report, dataset_urn_builder=self.gen_dataset_urn - ) + self.lineage_extractor = SnowflakeLineageExtractor( + config, self.report, dataset_urn_builder=self.gen_dataset_urn + ) if config.include_usage_stats or config.include_operational_stats: self.usage_extractor = SnowflakeUsageExtractor( diff --git a/metadata-ingestion/tests/integration/snowflake/common.py b/metadata-ingestion/tests/integration/snowflake/common.py index 43f5e04fbc89f..81e307a78ae9e 100644 --- a/metadata-ingestion/tests/integration/snowflake/common.py +++ b/metadata-ingestion/tests/integration/snowflake/common.py @@ -434,11 +434,6 @@ def default_query_results( # noqa: C901 } for op_idx in range(1, num_ops + 1) ] - elif query == snowflake_query.SnowflakeQuery.external_table_lineage_history( - 1654473600000, - 1654586220000, - ): - return [] elif query in [ snowflake_query.SnowflakeQuery.view_dependencies(), ]: @@ -509,10 +504,6 @@ def default_query_results( # noqa: C901 } ] elif query in [ - snowflake_query.SnowflakeQuery.external_table_lineage_history( - 1654473600000, - 1654586220000, - ), snowflake_query.SnowflakeQuery.view_dependencies_v2(), snowflake_query.SnowflakeQuery.view_dependencies(), snowflake_query.SnowflakeQuery.show_external_tables(), diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py index 53b2bcb236cd9..6135b0b3b3274 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake.py @@ -121,7 +121,6 @@ def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph): include_table_lineage=True, include_view_lineage=True, include_usage_stats=True, - use_legacy_lineage_method=False, validate_upstreams_against_patterns=False, include_operational_stats=True, email_as_user_identifier=True, @@ -213,7 +212,6 @@ def test_snowflake_private_link(pytestconfig, tmp_path, mock_time, mock_datahub_ include_column_lineage=False, include_views=False, include_view_lineage=False, - use_legacy_lineage_method=False, include_usage_stats=False, include_operational_stats=False, start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace( diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py index 73a261bb3cb6e..4963e71ae4d96 100644 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py +++ b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures.py @@ -55,7 +55,6 @@ def 
snowflake_pipeline_config(tmp_path): schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), include_view_lineage=False, include_usage_stats=False, - use_legacy_lineage_method=False, start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace( tzinfo=timezone.utc ), diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures_legacy_lineage.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures_legacy_lineage.py deleted file mode 100644 index a5993793e574d..0000000000000 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_failures_legacy_lineage.py +++ /dev/null @@ -1,291 +0,0 @@ -from datetime import datetime, timezone -from typing import cast -from unittest import mock - -from freezegun import freeze_time -from pytest import fixture - -from datahub.configuration.common import AllowDenyPattern, DynamicTypedConfig -from datahub.ingestion.run.pipeline import Pipeline -from datahub.ingestion.run.pipeline_config import PipelineConfig, SourceConfig -from datahub.ingestion.source.snowflake import snowflake_query -from datahub.ingestion.source.snowflake.snowflake_config import SnowflakeV2Config -from datahub.ingestion.source.snowflake.snowflake_query import SnowflakeQuery -from tests.integration.snowflake.common import ( - FROZEN_TIME, - NUM_TABLES, - default_query_results, -) - - -def query_permission_error_override(fn, override_for_query, error_msg): - def my_function(query): - if query in override_for_query: - raise Exception(error_msg) - else: - return fn(query) - - return my_function - - -def query_permission_response_override(fn, override_for_query, response): - def my_function(query): - if query in override_for_query: - return response - else: - return fn(query) - - return my_function - - -@fixture(scope="function") -def snowflake_pipeline_legacy_lineage_config(tmp_path): - output_file = tmp_path / "snowflake_test_events_permission_error.json" - config = PipelineConfig( - source=SourceConfig( - type="snowflake", - config=SnowflakeV2Config( - account_id="ABC12345.ap-south-1.aws", - username="TST_USR", - password="TST_PWD", - role="TEST_ROLE", - warehouse="TEST_WAREHOUSE", - include_technical_schema=True, - match_fully_qualified_names=True, - schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), - include_view_lineage=False, - include_usage_stats=False, - use_legacy_lineage_method=True, - start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace( - tzinfo=timezone.utc - ), - end_time=datetime(2022, 6, 7, 7, 17, 0, 0).replace(tzinfo=timezone.utc), - ), - ), - sink=DynamicTypedConfig(type="file", config={"filename": str(output_file)}), - ) - return config - - -@freeze_time(FROZEN_TIME) -def test_snowflake_missing_role_access_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - # Snowflake connection fails role not granted error - mock_connect.side_effect = Exception( - "250001 (08001): Failed to connect to DB: abc12345.ap-south-1.snowflakecomputing.com:443. Role 'TEST_ROLE' specified in the connect string is not granted to this user. Contact your local system administrator, or attempt to login with another role, e.g. 
PUBLIC" - ) - - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert "permission-error" in pipeline.source.get_report().failures.keys() - - -@freeze_time(FROZEN_TIME) -def test_snowflake_missing_warehouse_access_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Current warehouse query leads to blank result - sf_cursor.execute.side_effect = query_permission_response_override( - default_query_results, - [SnowflakeQuery.current_warehouse()], - [(None,)], - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert "permission-error" in pipeline.source.get_report().failures.keys() - - -@freeze_time(FROZEN_TIME) -def test_snowflake_no_databases_with_access_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in listing databases - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [SnowflakeQuery.get_databases("TEST_DB")], - "Database 'TEST_DB' does not exist or not authorized.", - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert "permission-error" in pipeline.source.get_report().failures.keys() - - -@freeze_time(FROZEN_TIME) -def test_snowflake_no_tables_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in listing databases - no_tables_fn = query_permission_response_override( - default_query_results, - [SnowflakeQuery.tables_for_schema("TEST_SCHEMA", "TEST_DB")], - [], - ) - sf_cursor.execute.side_effect = query_permission_response_override( - no_tables_fn, - [SnowflakeQuery.show_views_for_schema("TEST_SCHEMA", "TEST_DB")], - [], - ) - - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert "permission-error" in pipeline.source.get_report().failures.keys() - - -@freeze_time(FROZEN_TIME) -def test_snowflake_list_columns_error_causes_pipeline_warning( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in listing columns - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [ - SnowflakeQuery.columns_for_table( - "TABLE_{}".format(tbl_idx), "TEST_SCHEMA", "TEST_DB" - ) - for tbl_idx in range(1, NUM_TABLES + 1) - ], - "Database 'TEST_DB' does not exist or not authorized.", - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - pipeline.raise_from_status() # pipeline should not fail - assert ( - "Failed to get columns for table" - in pipeline.source.get_report().warnings.keys() - ) - - 
-@freeze_time(FROZEN_TIME) -def test_snowflake_list_primary_keys_error_causes_pipeline_warning( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in listing keys leads to warning - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [SnowflakeQuery.show_primary_keys_for_schema("TEST_SCHEMA", "TEST_DB")], - "Insufficient privileges to operate on TEST_DB", - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - pipeline.raise_from_status() # pipeline should not fail - assert ( - "Failed to get primary key for table" - in pipeline.source.get_report().warnings.keys() - ) - - -@freeze_time(FROZEN_TIME) -def test_snowflake_missing_snowflake_lineage_permission_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in getting lineage - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [ - snowflake_query.SnowflakeQuery.table_to_table_lineage_history( - 1654473600000, 1654586220000, True - ), - ], - "Database 'SNOWFLAKE' does not exist or not authorized.", - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert ( - "lineage-permission-error" in pipeline.source.get_report().failures.keys() - ) - - -@freeze_time(FROZEN_TIME) -def test_snowflake_missing_snowflake_operations_permission_causes_pipeline_failure( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in getting access history date range - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [snowflake_query.SnowflakeQuery.get_access_history_date_range()], - "Database 'SNOWFLAKE' does not exist or not authorized.", - ) - pipeline = Pipeline(snowflake_pipeline_legacy_lineage_config) - pipeline.run() - assert "usage-permission-error" in pipeline.source.get_report().failures.keys() - - -@freeze_time(FROZEN_TIME) -def test_snowflake_unexpected_snowflake_view_lineage_error_causes_pipeline_warning( - pytestconfig, - snowflake_pipeline_legacy_lineage_config, -): - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - # Error in getting view lineage - sf_cursor.execute.side_effect = query_permission_error_override( - default_query_results, - [snowflake_query.SnowflakeQuery.view_dependencies()], - "Unexpected Error", - ) - - snowflake_pipeline_config1 = snowflake_pipeline_legacy_lineage_config.copy() - cast( - SnowflakeV2Config, - cast(PipelineConfig, snowflake_pipeline_config1).source.config, - ).include_view_lineage = True - pipeline = Pipeline(snowflake_pipeline_config1) - pipeline.run() - pipeline.raise_from_status() # pipeline should not 
fail - assert "view-upstream-lineage" in pipeline.source.get_report().warnings.keys() diff --git a/metadata-ingestion/tests/integration/snowflake/test_snowflake_legacy_lineage.py b/metadata-ingestion/tests/integration/snowflake/test_snowflake_legacy_lineage.py deleted file mode 100644 index 59da7ddf695d8..0000000000000 --- a/metadata-ingestion/tests/integration/snowflake/test_snowflake_legacy_lineage.py +++ /dev/null @@ -1,207 +0,0 @@ -import random -from datetime import datetime, timezone -from unittest import mock - -import pandas as pd -import pytest -from freezegun import freeze_time - -from datahub.configuration.common import AllowDenyPattern, DynamicTypedConfig -from datahub.ingestion.glossary.classifier import ( - ClassificationConfig, - DynamicTypedClassifierConfig, -) -from datahub.ingestion.glossary.datahub_classifier import ( - DataHubClassifierConfig, - InfoTypeConfig, - PredictionFactorsAndWeights, - ValuesFactorConfig, -) -from datahub.ingestion.run.pipeline import Pipeline -from datahub.ingestion.run.pipeline_config import PipelineConfig, SourceConfig -from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig -from datahub.ingestion.source.snowflake.snowflake_config import ( - SnowflakeV2Config, - TagOption, -) -from tests.integration.snowflake.common import FROZEN_TIME, default_query_results -from tests.integration.snowflake.test_snowflake import random_cloud_region, random_email -from tests.test_helpers import mce_helpers - - -@pytest.mark.integration -def test_snowflake_basic(pytestconfig, tmp_path, mock_time, mock_datahub_graph): - test_resources_dir = pytestconfig.rootpath / "tests/integration/snowflake" - - # Run the metadata ingestion pipeline. - output_file = tmp_path / "snowflake_test_events.json" - golden_file = test_resources_dir / "snowflake_golden.json" - - with mock.patch("snowflake.connector.connect") as mock_connect, mock.patch( - "datahub.ingestion.source.snowflake.snowflake_v2.SnowflakeV2Source.get_sample_values_for_table" - ) as mock_sample_values: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - - sf_cursor.execute.side_effect = default_query_results - - mock_sample_values.return_value = pd.DataFrame( - data={ - "col_1": [random.randint(1, 80) for i in range(20)], - "col_2": [random_email() for i in range(20)], - "col_3": [random_cloud_region() for i in range(20)], - } - ) - - datahub_classifier_config = DataHubClassifierConfig( - minimum_values_threshold=10, - confidence_level_threshold=0.58, - info_types_config={ - "Age": InfoTypeConfig( - Prediction_Factors_and_Weights=PredictionFactorsAndWeights( - Name=0, Values=1, Description=0, Datatype=0 - ) - ), - "CloudRegion": InfoTypeConfig( - Prediction_Factors_and_Weights=PredictionFactorsAndWeights( - Name=0, - Description=0, - Datatype=0, - Values=1, - ), - Values=ValuesFactorConfig( - prediction_type="regex", - regex=[ - r"(af|ap|ca|eu|me|sa|us)-(central|north|(north(?:east|west))|south|south(?:east|west)|east|west)-\d+" - ], - ), - ), - }, - ) - - pipeline = Pipeline( - config=PipelineConfig( - source=SourceConfig( - type="snowflake", - config=SnowflakeV2Config( - account_id="ABC12345.ap-south-1.aws", - username="TST_USR", - password="TST_PWD", - match_fully_qualified_names=True, - schema_pattern=AllowDenyPattern(allow=["test_db.test_schema"]), - include_technical_schema=True, - include_table_lineage=True, - include_view_lineage=True, - include_usage_stats=True, - 
use_legacy_lineage_method=True, - validate_upstreams_against_patterns=False, - include_operational_stats=True, - start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace( - tzinfo=timezone.utc - ), - end_time=datetime(2022, 6, 7, 7, 17, 0, 0).replace( - tzinfo=timezone.utc - ), - classification=ClassificationConfig( - enabled=True, - classifiers=[ - DynamicTypedClassifierConfig( - type="datahub", config=datahub_classifier_config - ) - ], - ), - profiling=GEProfilingConfig( - enabled=True, - profile_if_updated_since_days=None, - profile_table_row_limit=None, - profile_table_size_limit=None, - profile_table_level_only=True, - ), - extract_tags=TagOption.without_lineage, - ), - ), - sink=DynamicTypedConfig( - type="file", config={"filename": str(output_file)} - ), - ) - ) - pipeline.run() - pipeline.pretty_print_summary() - pipeline.raise_from_status() - - # Verify the output. - - mce_helpers.check_golden_file( - pytestconfig, - output_path=output_file, - golden_path=golden_file, - ignore_paths=[ - r"root\[\d+\]\['aspect'\]\['json'\]\['timestampMillis'\]", - r"root\[\d+\]\['aspect'\]\['json'\]\['created'\]", - r"root\[\d+\]\['aspect'\]\['json'\]\['lastModified'\]", - r"root\[\d+\]\['aspect'\]\['json'\]\['fields'\]\[\d+\]\['glossaryTerms'\]\['auditStamp'\]\['time'\]", - r"root\[\d+\]\['systemMetadata'\]", - ], - ) - - -@freeze_time(FROZEN_TIME) -@pytest.mark.integration -def test_snowflake_private_link(pytestconfig, tmp_path, mock_time, mock_datahub_graph): - test_resources_dir = pytestconfig.rootpath / "tests/integration/snowflake" - - # Run the metadata ingestion pipeline. - output_file = tmp_path / "snowflake_privatelink_test_events.json" - golden_file = test_resources_dir / "snowflake_privatelink_golden.json" - - with mock.patch("snowflake.connector.connect") as mock_connect: - sf_connection = mock.MagicMock() - sf_cursor = mock.MagicMock() - mock_connect.return_value = sf_connection - sf_connection.cursor.return_value = sf_cursor - sf_cursor.execute.side_effect = default_query_results - - pipeline = Pipeline( - config=PipelineConfig( - source=SourceConfig( - type="snowflake", - config=SnowflakeV2Config( - account_id="ABC12345.ap-south-1.privatelink", - username="TST_USR", - password="TST_PWD", - schema_pattern=AllowDenyPattern(allow=["test_schema"]), - include_technical_schema=True, - include_table_lineage=True, - include_column_lineage=False, - include_views=False, - include_view_lineage=False, - use_legacy_lineage_method=True, - include_usage_stats=False, - include_operational_stats=False, - start_time=datetime(2022, 6, 6, 7, 17, 0, 0).replace( - tzinfo=timezone.utc - ), - end_time=datetime(2022, 6, 7, 7, 17, 0, 0).replace( - tzinfo=timezone.utc - ), - ), - ), - sink=DynamicTypedConfig( - type="file", config={"filename": str(output_file)} - ), - ) - ) - pipeline.run() - pipeline.pretty_print_summary() - pipeline.raise_from_status() - - # Verify the output. 
-
-    mce_helpers.check_golden_file(
-        pytestconfig,
-        output_path=output_file,
-        golden_path=golden_file,
-        ignore_paths=[],
-    )

From 01ae5d96da45a259122a547504265025624c0e11 Mon Sep 17 00:00:00 2001
From: Andrew Sikowitz
Date: Wed, 23 Aug 2023 15:58:34 -0400
Subject: [PATCH 11/11] fix(ingest/ldap): Handle case when 'objectClass' not in attrs (#8658)

---
 metadata-ingestion/src/datahub/ingestion/source/ldap.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/metadata-ingestion/src/datahub/ingestion/source/ldap.py b/metadata-ingestion/src/datahub/ingestion/source/ldap.py
index 497b49acb6505..e1d035a96d42f 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/ldap.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/ldap.py
@@ -271,10 +271,11 @@ def get_workunits_internal(self) -> Iterable[MetadataWorkUnit]:
                 if dn is None:
                     continue
 
-                if not attrs:
+                if not attrs or "objectClass" not in attrs:
                     self.report.report_warning(
                         "",
-                        f"skipping {dn} because attrs is empty; check your permissions if this is unexpected",
+                        f"skipping {dn} because attrs ({attrs}) does not contain expected data; "
+                        f"check your permissions if this is unexpected",
                     )
                     continue
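The objectClass guard in the hunk above can be exercised in isolation. The snippet below is a minimal illustrative sketch, not part of the patch or of DataHub's codebase: the should_skip_ldap_entry helper and the sample entries are assumptions made for the example, and it only mirrors the skip-and-warn decision, not the full LDAP ingestion loop.

# Minimal, self-contained sketch of the guard introduced in PATCH 11/11.
# The helper name and the sample entries below are illustrative assumptions.
from typing import Any, Dict, Optional


def should_skip_ldap_entry(attrs: Optional[Dict[str, Any]]) -> bool:
    # An entry is skipped when it has no attributes at all, or when the
    # attributes dict lacks the "objectClass" key the source relies on.
    return not attrs or "objectClass" not in attrs


if __name__ == "__main__":
    sample_entries = [
        ("uid=jdoe,ou=people,dc=example,dc=org", {"objectClass": [b"inetOrgPerson"]}),
        ("uid=ghost,ou=people,dc=example,dc=org", {}),
        ("uid=svc,ou=people,dc=example,dc=org", {"cn": [b"service account"]}),
    ]
    for dn, attrs in sample_entries:
        if should_skip_ldap_entry(attrs):
            # Mirrors the warning emitted by the patched source.
            print(f"skipping {dn} because attrs ({attrs}) does not contain expected data")
        else:
            print(f"processing {dn}")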