Merge branch '0.6_dev' into patch-1

eltociear · Nov 15, 2024
commit 83f255c · 2 parents: 34146f8 + c7e8745

Showing 232 changed files with 8,259 additions and 4,091 deletions.
diff --git a/.github/workflows/cloud-code-scan.yml b/.github/workflows/cloud-code-scan.yml
@@ -0,0 +1,22 @@
+name: Alipay Cloud Devops Codescan
+on:
+  pull_request_target:
+jobs:
+  stc:
+    runs-on: ubuntu-latest
+    steps:
+      - name: codeScan
+        uses: layotto/alipay-cloud-devops-codescan@main
+        with:
+          parent_uid: ${{ secrets.ALI_PID }}
+          private_key: ${{ secrets.ALI_PK }}
+          scan_type: stc
+  sca:
+    runs-on: ubuntu-latest
+    steps:
+      - name: codeScan
+        uses: layotto/alipay-cloud-devops-codescan@main
+        with:
+          parent_uid: ${{ secrets.ALI_PID }}
+          private_key: ${{ secrets.ALI_PK }}
+          scan_type: sca
diff --git a/.github/workflows/code-format-check.yml b/.github/workflows/code-format-check.yml
@@ -0,0 +1,28 @@
+name: Code Format Check
+
+on:
+  push:
+  pull_request:
+  workflow_dispatch:
+  repository_dispatch:
+    types: [my_event]
+jobs:
+  format-check:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install pre-commit
+
+      - name: Run pre-commit
+        run: pre-commit run --all-files
diff --git a/.github/workflows/license-checker.yml b/.github/workflows/license-checker.yml
@@ -0,0 +1,25 @@
+name: License Checker
+
+on:
+  push:
+    branches:
+      - master
+  pull_request:
+    branches:
+      - master
+
+jobs:
+  check:
+    name: "License Validation"
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Check License Header
+        uses: apache/skywalking-eyes@main
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          log: info
+      - name: Check Dependencies' License
+        uses: apache/skywalking-eyes/dependency@main
diff --git a/.github/workflows/pr-title-check.yml b/.github/workflows/pr-title-check.yml
@@ -0,0 +1,28 @@
+name: "Lint PR"
+
+on:
+  pull_request_target:
+    types:
+      - opened
+      - edited
+      - synchronize
+
+jobs:
+  main:
+    name: Validate PR title
+    runs-on: ubuntu-latest
+    steps:
+      # https://www.conventionalcommits.org/en/v1.0.0/#summary
+      - uses: amannn/action-semantic-pull-request@v5
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        with:
+          requireScope: true
+          subjectPattern: ^(?![A-Z]).+$
+          # If `subjectPattern` is configured, you can use this property to override
+          # the default error message that is shown when the pattern doesn't match.
+          # The variables `subject` and `title` can be used within the message.
+          subjectPatternError: |
+            The subject "{subject}" found in the pull request title "{title}"
+            didn't match the configured pattern. Please ensure that the subject
+            doesn't start with an uppercase character.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,12 @@
+repos:
+  - repo: https://github.com/psf/black
+    rev: 22.3.0
+    hooks:
+      - id: black
+        files: ^kag/.*\.py$
+  - repo: https://github.com/pycqa/flake8
+    rev: 4.0.1
+    hooks:
+      - id: flake8
+        files: ^kag/.*\.py$
+
diff --git a/KAG_VERSION b/KAG_VERSION
@@ -1 +1 @@
-0.5-beta1
+0.5.2-beta1
diff --git a/kag/__init__.py b/kag/__init__.py
@@ -202,7 +202,7 @@
 
 
 __package_name__ = "openspg-kag"
-__version__ = "0.5-beta1"
+__version__ = "0.5.2-beta1"
 
 from kag.common.env import init_env
 

diff --git a/kag/builder/component/aligner/__init__.py b/kag/builder/component/aligner/__init__.py
@@ -9,4 +9,3 @@
 # Unless required by applicable law or agreed to in writing, software distributed under the License
 # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 # or implied.
-
diff --git a/kag/builder/component/base.py b/kag/builder/component/base.py
@@ -10,22 +10,28 @@
 # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 # or implied.
 import os
-from abc import ABC
 from typing import List, Dict
 import logging
 
 from knext.common.base.component import Component
 from knext.common.base.runnable import Input, Output
 from knext.project.client import ProjectClient
-from kag.common.llm.client import LLMClient
+from kag.common.llm import LLMClient
+from kag.common.registry import Registrable
 
 
-class BuilderComponent(Component, ABC):
+@Registrable.register("builder")
+class BuilderComponent(Component, Registrable):
     """
     Abstract base class for all builder component.
     """
 
-    project_id: str = None
+    def __init__(self, project_id: int = None, **kwargs):
+        super().__init__(**kwargs)
+        if project_id is None:
+            project_id = int(os.getenv("KAG_PROJECT_ID"))
+        self.project_id = project_id
+        self.config = ProjectClient().get_config(self.project_id)
 
     def _init_llm(self) -> LLMClient:
         """
@@ -48,9 +54,9 @@ def _init_llm(self) -> LLMClient:
             try:
                 config = ProjectClient().get_config(project_id)
                 llm_config.update(config.get("llm", {}))
-            except:
+            except Exception as e:
                 logging.warning(
-                    f"Failed to get project config for project id: {project_id}"
+                    f"Failed to get project config for project id: {project_id}, info: {e}"
                 )
         llm = LLMClient.from_config(llm_config)
         return llm

diff --git a/kag/builder/component/extractor/kag_extractor.py b/kag/builder/component/extractor/kag_extractor.py
@@ -39,10 +39,17 @@ class KAGExtractor(ExtractorABC):
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
         self.llm = self._init_llm()
-        self.biz_scene = os.getenv("KAG_PROMPT_BIZ_SCENE", "default")
-        self.language = os.getenv("KAG_PROMPT_LANGUAGE", "en")
+        self.prompt_config = self.config.get("prompt", {})
+        self.biz_scene = self.prompt_config.get("biz_scene") or os.getenv(
+            "KAG_PROMPT_BIZ_SCENE", "default"
+        )
+        self.language = self.prompt_config.get("language") or os.getenv(
+            "KAG_PROMPT_LANGUAGE", "en"
+        )
         self.schema = SchemaClient(project_id=self.project_id).load()
-        self.ner_prompt = PromptOp.load(self.biz_scene, "ner")(language=self.language, project_id=self.project_id)
+        self.ner_prompt = PromptOp.load(self.biz_scene, "ner")(
+            language=self.language, project_id=self.project_id
+        )
         self.std_prompt = PromptOp.load(self.biz_scene, "std")(language=self.language)
         self.triple_prompt = PromptOp.load(self.biz_scene, "triple")(
             language=self.language
@@ -59,7 +66,9 @@ def __init__(self, **kwargs):
                     self.kg_types.append(type_name)
                     break
         if self.kg_types:
-            self.kg_prompt = SPG_KGPrompt(self.kg_types, language=self.language, project_id=self.project_id)
+            self.kg_prompt = SPG_KGPrompt(
+                self.kg_types, language=self.language, project_id=self.project_id
+            )
 
     @property
     def input_types(self) -> Type[Input]:
@@ -129,17 +138,26 @@ def assemble_sub_graph_with_spg_records(self, entities: List[Dict]):
                     continue
                 if prop_name in spg_type.properties:
                     from knext.schema.model.property import Property
+
                     prop: Property = spg_type.properties.get(prop_name)
                     o_label = prop.object_type_name_en
                     if o_label not in BASIC_TYPES:
                         if isinstance(prop_value, str):
                             prop_value = [prop_value]
                         for o_name in prop_value:
                             sub_graph.add_node(id=o_name, name=o_name, label=o_label)
-                            sub_graph.add_edge(s_id=s_name, s_label=s_label, p=prop_name, o_id=o_name, o_label=o_label)
+                            sub_graph.add_edge(
+                                s_id=s_name,
+                                s_label=s_label,
+                                p=prop_name,
+                                o_id=o_name,
+                                o_label=o_label,
+                            )
                         tmp_properties.pop(prop_name)
             record["properties"] = tmp_properties
-            sub_graph.add_node(id=s_name, name=s_name, label=s_label, properties=properties)
+            sub_graph.add_node(
+                id=s_name, name=s_name, label=s_label, properties=properties
+            )
         return sub_graph, entities
 
     @staticmethod
@@ -173,10 +191,9 @@ def get_category(entities_data, entity_name):
             if o_category is None:
                 o_category = OTHER_TYPE
                 sub_graph.add_node(tri[2], tri[2], o_category)
-
-            sub_graph.add_edge(
-                tri[0], s_category, to_camel_case(tri[1]), tri[2], o_category
-            )
+            edge_type = to_camel_case(tri[1])
+            if edge_type:
+                sub_graph.add_edge(tri[0], s_category, edge_type, tri[2], o_category)
 
         return sub_graph
 
@@ -198,14 +215,18 @@ def assemble_sub_graph_with_chunk(sub_graph: SubGraph, chunk: Chunk):
                 "id": chunk.id,
                 "name": chunk.name,
                 "content": f"{chunk.name}\n{chunk.content}",
-                **chunk.kwargs
+                **chunk.kwargs,
             },
         )
         sub_graph.id = chunk.id
         return sub_graph
 
     def assemble_sub_graph(
-        self, sub_graph: SubGraph, chunk: Chunk, entities: List[Dict], triples: List[list]
+        self,
+        sub_graph: SubGraph,
+        chunk: Chunk,
+        entities: List[Dict],
+        triples: List[list],
     ):
         """
         Integrates entity and triple information into a subgraph, and associates it with a chunk of text.
@@ -310,7 +331,10 @@ def invoke(self, input: Input, **kwargs) -> List[Output]:
         try:
             entities = self.named_entity_recognition(passage)
             sub_graph, entities = self.assemble_sub_graph_with_spg_records(entities)
-            filtered_entities = [{k: v for k, v in ent.items() if k in ["entity", "category"]} for ent in entities]
+            filtered_entities = [
+                {k: v for k, v in ent.items() if k in ["entity", "category"]}
+                for ent in entities
+            ]
             triples = self.triples_extraction(passage, filtered_entities)
             std_entities = self.named_entity_standardization(passage, filtered_entities)
             self.append_official_name(entities, std_entities)

diff --git a/kag/builder/component/extractor/spg_extractor.py b/kag/builder/component/extractor/spg_extractor.py
@@ -42,7 +42,9 @@ def __init__(self, **kwargs):
                     self.spg_ner_types.append(type_name)
                     continue
             self.kag_ner_types.append(type_name)
-        self.kag_ner_prompt = PromptOp.load(self.biz_scene, "ner")(language=self.language, project_id=self.project_id)
+        self.kag_ner_prompt = PromptOp.load(self.biz_scene, "ner")(
+            language=self.language, project_id=self.project_id
+        )
         self.spg_ner_prompt = SPG_KGPrompt(self.spg_ner_types, self.language)
 
     @retry(stop=stop_after_attempt(3))
@@ -72,17 +74,26 @@ def assemble_sub_graph_with_spg_records(self, entities: List[Dict]):
                     continue
                 if prop_name in spg_type.properties:
                     from knext.schema.model.property import Property
+
                     prop: Property = spg_type.properties.get(prop_name)
                     o_label = prop.object_type_name_en
                     if o_label not in BASIC_TYPES:
                         if isinstance(prop_value, str):
                             prop_value = [prop_value]
                         for o_name in prop_value:
                             sub_graph.add_node(id=o_name, name=o_name, label=o_label)
-                            sub_graph.add_edge(s_id=s_name, s_label=s_label, p=prop_name, o_id=o_name, o_label=o_label)
+                            sub_graph.add_edge(
+                                s_id=s_name,
+                                s_label=s_label,
+                                p=prop_name,
+                                o_id=o_name,
+                                o_label=o_label,
+                            )
                         tmp_properties.pop(prop_name)
             record["properties"] = tmp_properties
-            sub_graph.add_node(id=s_name, name=s_name, label=s_label, properties=properties)
+            sub_graph.add_node(
+                id=s_name, name=s_name, label=s_label, properties=properties
+            )
         return sub_graph, entities
 
     def invoke(self, input: Input, **kwargs) -> List[Output]:
@@ -102,7 +113,10 @@ def invoke(self, input: Input, **kwargs) -> List[Output]:
         try:
             entities = self.named_entity_recognition(passage)
             sub_graph, entities = self.assemble_sub_graph_with_spg_records(entities)
-            filtered_entities = [{k: v for k, v in ent.items() if k in ["entity", "category"]} for ent in entities]
+            filtered_entities = [
+                {k: v for k, v in ent.items() if k in ["entity", "category"]}
+                for ent in entities
+            ]
             triples = self.triples_extraction(passage, filtered_entities)
             std_entities = self.named_entity_standardization(passage, filtered_entities)
             self.append_official_name(entities, std_entities)

diff --git a/kag/builder/component/mapping/relation_mapping.py b/kag/builder/component/mapping/relation_mapping.py
@@ -40,7 +40,7 @@ def __init__(
         subject_name: SPGTypeName,
         predicate_name: RelationName,
         object_name: SPGTypeName,
-        **kwargs
+        **kwargs,
     ):
         super().__init__(**kwargs)
         schema = SchemaClient(project_id=self.project_id).load()

diff --git a/kag/builder/component/mapping/spg_type_mapping.py b/kag/builder/component/mapping/spg_type_mapping.py
@@ -39,7 +39,9 @@ class SPGTypeMapping(MappingABC):
         fuse_op (FuseOpABC, optional): The user-defined fuse operator. Defaults to None.
     """
 
-    def __init__(self, spg_type_name: SPGTypeName, fuse_func: FuseFunc = None, **kwargs):
+    def __init__(
+        self, spg_type_name: SPGTypeName, fuse_func: FuseFunc = None, **kwargs
+    ):
         super().__init__(**kwargs)
         self.schema = SchemaClient(project_id=self.project_id).load()
         assert (
Original file line number	Diff line number	Diff line change
Expand Up		@@ -9,4 +9,3 @@
		# Unless required by applicable law or agreed to in writing, software distributed under the License
		# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
		# or implied.