Skip to content

Commit

Permalink
debug examples
Browse files Browse the repository at this point in the history
  • Loading branch information
zhuzhongshu123 committed Nov 8, 2024
1 parent c18df1f commit 40ae2bb
Show file tree
Hide file tree
Showing 32 changed files with 391 additions and 365 deletions.
2 changes: 2 additions & 0 deletions kag/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,8 @@
import kag.interface
import kag.builder.component
import kag.builder.prompt
import kag.solver.prompt

from kag.common.conf import init_env

init_env()
2 changes: 1 addition & 1 deletion kag/builder/component/writer/kg_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class AlterOperationEnum(str, Enum):
Delete = "DELETE"


@SinkWriterABC.register("kg")
@SinkWriterABC.register("kg", as_default=True)
class KGWriter(SinkWriterABC):
"""
A class that extends `SinkWriterABC` to handle writing data into a Neo4j knowledge graph.
Expand Down
6 changes: 5 additions & 1 deletion kag/common/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,4 +138,8 @@ def init_env():
msg = "Done init config from server"
else:
msg = "Done init config from local file"
print(f"==================={msg}===================:\n{KAG_CONFIG.all_config}")

print(f"==================={msg}===================")
import pprint

pprint.pprint(KAG_CONFIG.all_config, indent=2)
1 change: 1 addition & 0 deletions kag/common/registry/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def import_modules_from_path(path: str) -> None:
User can specify their custom packages and have their custom
classes get loaded and registered.
"""
path = os.path.abspath(os.path.normpath(path))
importlib.invalidate_caches()
tmp = path.rsplit("/", 1)
if len(tmp) == 1:
Expand Down
92 changes: 33 additions & 59 deletions kag/examples/musique/builder/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,80 +8,54 @@
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied.
import json
import logging
import os
from typing import List, Type

from kag.common.registry import Registrable, import_modules_from_path
from kag.builder.component import KGWriter
from kag.builder.component.extractor import KAGExtractor
from kag.builder.component.splitter import LengthSplitter
from kag.builder.component.vectorizer.batch_vectorizer import BatchVectorizer
from kag.builder.model.chunk import Chunk
from kag.interface import ExtractorABC, SplitterABC, VectorizerABC, SourceReaderABC
from knext.builder.builder_chain_abc import BuilderChainABC
from kag.interface.builder import SourceReaderABC
from knext.common.base.runnable import Input, Output

logger = logging.getLogger(__name__)
import_modules_from_path(".")


class MusiqueBuilderChain(BuilderChainABC, Registrable):
def __init__(
self,
reader: SourceReaderABC,
splitter: SplitterABC,
extractor: ExtractorABC,
vectorizer: VectorizerABC,
writer: KGWriter,
):
self.reader = reader
self.splitter = splitter
self.extractor = extractor
self.vectorizer = vectorizer
self.writer = writer


class MusiqueCorpusReader(SourceReaderABC):
@property
def input_types(self) -> Type[Input]:
"""The type of input this Runnable object accepts specified as a type annotation."""
return str

@property
def output_types(self) -> Type[Output]:
"""The type of output this Runnable object produces specified as a type annotation."""
return Chunk

def get_basename(self, file_name: str):
base, ext = os.path.splitext(os.path.basename(file_name))
return base

def invoke(self, input: str, **kwargs) -> List[Output]:
id_column = kwargs.get("id_column", "title")
name_column = kwargs.get("name_column", "title")
content_column = kwargs.get("content_column", "text")

if os.path.exists(str(input)):
with open(input, "r") as f:
corpusList = json.load(f)
else:
corpusList = input
chunks = []

for item in corpusList:
chunk = Chunk(
id=item[id_column],
name=item[name_column],
content=item[content_column],
)
chunks.append(chunk)
return chunks


class MusiqueBuilderChain(BuilderChainABC):
def build(self, **kwargs):
source = MusiqueCorpusReader()
splitter = LengthSplitter(window_length=2000)
extractor = KAGExtractor()
vectorizer = BatchVectorizer()
sink = KGWriter()
return (
self.reader
>> self.splitter
>> self.extractor
>> self.vectorizer
>> self.writer
)

return source >> splitter >> extractor >> vectorizer >> sink

def buildKB(file_path):
    """Build the MuSiQue knowledge base from the corpus at ``file_path``.

    The builder chain is instantiated from the ``chain`` section of the
    loaded KAG configuration and then invoked on the given file.

    Args:
        file_path: Path of the corpus file passed to the chain's ``invoke``.
    """
    # Imported inside the function, as in the original.
    from kag.common.conf import KAG_CONFIG

    chain_config = KAG_CONFIG.all_config["chain"]
    chain = MusiqueBuilderChain.from_config(chain_config)
    chain.invoke(file_path=file_path, max_workers=20)

    # The parameter used to be called ``corpusFilePath``; the log line must use
    # the current name, otherwise it raises NameError after the build finishes.
    logger.info(f"\n\nbuildKB successfully for {file_path}\n\n")


if __name__ == "__main__":
    # Relative corpus path, handed to buildKB unchanged.
    file_path = "./data/musique_sub_corpus.json"
    # Alternative corpus; uses the same variable name so that uncommenting it
    # actually switches the input.
    # file_path = "./data/musique_train_corpus.json"

    buildKB(file_path)
16 changes: 10 additions & 6 deletions kag/examples/musique/builder/prompt/ner.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,15 @@

import json
from string import Template
from typing import List, Optional
from typing import List

from kag.common.base.prompt_op import PromptOp
from kag.common.conf import KAG_PROJECT_CONF
from kag.interface import PromptABC
from knext.schema.client import SchemaClient


class OpenIENERPrompt(PromptOp):
@PromptABC.register("musique_ner")
class OpenIENERPrompt(PromptABC):

template_en = """
{
Expand Down Expand Up @@ -85,9 +87,11 @@ class OpenIENERPrompt(PromptOp):

template_zh = template_en

def __init__(self, language: Optional[str] = "en", **kwargs):
super().__init__(language, **kwargs)
self.schema = SchemaClient(project_id=self.project_id).extract_types()
def __init__(self, language: str = ""):
super().__init__(language)
self.schema = SchemaClient(
project_id=KAG_PROJECT_CONF.project_id
).extract_types()
self.template = Template(self.template).safe_substitute(schema=self.schema)

@property
Expand Down
11 changes: 5 additions & 6 deletions kag/examples/musique/builder/prompt/std.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,14 @@
# or implied.

import json
from typing import Optional, List
from typing import List

from kag.common.base.prompt_op import PromptOp

from kag.interface import PromptABC

class OpenIEEntitystandardizationdPrompt(PromptOp):

@PromptABC.register("musique_std")
class OpenIEEntitystandardizationdPrompt(PromptABC):
template_en = """
{
"instruction": "The `input` field contains a user provided context. The `named_entities` field contains extracted named entities from the context, which may be unclear abbreviations, aliases, or slang. To eliminate ambiguity, please attempt to provide the official names of these entities based on the context and your own knowledge. Note that entities with the same meaning can only have ONE official name. Please respond in the format of a single JSONArray string without any explanation, as shown in the `output` field of the provided example.",
Expand Down Expand Up @@ -82,9 +84,6 @@ class OpenIEEntitystandardizationdPrompt(PromptOp):

template_zh = """"""

def __init__(self, language: Optional[str] = "en"):
super().__init__(language)

@property
def template_variables(self) -> List[str]:
return ["input", "named_entities"]
Expand Down
5 changes: 3 additions & 2 deletions kag/examples/musique/builder/prompt/triple.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@
import json
from typing import Optional, List

from kag.common.base.prompt_op import PromptOp
from kag.interface import PromptABC


class OpenIETriplePrompt(PromptOp):
@PromptABC.register("musique_triple")
class OpenIETriplePrompt(PromptABC):
template_en = """
{
"instruction": "You are an expert specializing in carrying out open information extraction (OpenIE). Please extract any possible relations (including subject, predicate, object) from the given text, and list them following the json format {\"triples\": [[\"subject\", \"predicate\", \"object\"]]}\n. If there are none, do not list them.\n.\n\nPay attention to the following requirements:\n- Each triple should contain at least one, but preferably two, of the named entities in the entity_list.\n- Clearly resolve pronouns to their specific names to maintain clarity.",
Expand Down
31 changes: 0 additions & 31 deletions kag/examples/musique/kag_config.cfg

This file was deleted.

49 changes: 49 additions & 0 deletions kag/examples/musique/kag_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
project:
biz_scene: default
host_addr: http://121.40.150.147:8887
id: '9'
language: en
namespace: MuSiQue
project_id: 666

llm: &extract_llm
api_key: key
base_url: https://api.deepseek.com
model: deepseek-chat
type: maas

chain:
extractor:
llm: *extract_llm
ner_prompt:
type: musique_ner
std_prompt:
type: musique_std
triple_prompt:
type: musique_triple
type: kag
reader:
type: musique
splitter:
split_length: 100000
type: length
window_length: 0
vectorizer:
type: batch
vectorizer_model:
path: ~/.cache/vectorizer/BAAI/bge-base-zh-v1.5
type: bge
vector_dimensions: 768
writer:
type: kg
indexer:
similarity_threshold: 0.8
with_semantic: false

log:
level: INFO
retriever:
match_threshold: 0.8
pagerank_threshold: 0.9
top_k: 10
with_semantic: false
2 changes: 1 addition & 1 deletion kag/examples/musique/solver/evaForMusique.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from kag.common.benchmarks.evaluate import Evaluate
from kag.examples.utils import delay_run
from kag.interface.solver.lf_planner_abc import LFPlannerABC
from kag.interface import LFPlannerABC
from kag.solver.implementation.default_kg_retrieval import KGRetrieverByLlm
from kag.solver.implementation.default_reasoner import DefaultReasoner
from kag.solver.implementation.lf_chunk_retriever import LFChunkRetriever
Expand Down
4 changes: 2 additions & 2 deletions kag/examples/musique/solver/prompt/resp_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
from typing import List
import logging

from kag.common.base.prompt_op import PromptOp
from kag.interface import PromptABC

logger = logging.getLogger(__name__)


class RespGenerator(PromptOp):
class RespGenerator(PromptABC):
template_zh = (
"基于给定的引用信息回答问题。" "\n只输出答案,不需要输出额外的信息。" "\n给定的引用信息:'$memory'\n问题:'$instruction'"
)
Expand Down
17 changes: 15 additions & 2 deletions kag/interface/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,13 @@
from kag.interface.builder.writer_abc import SinkWriterABC
from kag.interface.builder.vectorizer_abc import VectorizerABC

from knext.builder.builder_chain_abc import BuilderChainABC
from kag.interface.solver.base import KagBaseModule, Question
from kag.interface.solver.kag_generator_abc import KAGGeneratorABC
from kag.interface.solver.kag_memory_abc import KagMemoryABC
from kag.interface.solver.kag_reasoner_abc import KagReasonerABC
from kag.interface.solver.kag_reflector_abc import KagReflectorABC
from kag.interface.solver.lf_planner_abc import LFPlannerABC
from kag.interface.solver.lf_solver_abc import LFSolverABC


__all__ = [
Expand All @@ -30,5 +36,12 @@
"AlignerABC",
"SinkWriterABC",
"VectorizerABC",
"BuilderChainABC",
"KagBaseModule",
"Question",
"KAGGeneratorABC",
"KagMemoryABC",
"KagReasonerABC",
"KagReflectorABC",
"LFPlannerABC",
"LFSolverABC",
]
Loading

0 comments on commit 40ae2bb

Please sign in to comment.