Commit

update readme.md

Alannikos committed Jan 11, 2025
1 parent 9d8acec commit 5bf1a58
Showing 35 changed files with 3,688 additions and 0 deletions.
7 changes: 7 additions & 0 deletions README.md
@@ -3,6 +3,13 @@
<div align="center">

![welcome](assets/welcome.png)
<!-- ```
__ __ __ __ ___ __ __ __ __
| | |_ | / / \ |\/| |_ | / \ |_ | \ / _ |__| | | |\/|
|/\| |__ |__ \__ \__/ | | |__ | \__/ |__ |__/ \__) | |__ |__ | |
``` -->


</div>

7 changes: 7 additions & 0 deletions build/lib/edg4llm/__init__.py
@@ -0,0 +1,7 @@
from edg4llm.core.interface import EDG4LLM

__all__ = ["EDG4LLM"]

__version__ = "1.0.14"
__author__ = "Alannikos"
__license__ = "MIT"
1 change: 1 addition & 0 deletions build/lib/edg4llm/core/__init__.py
@@ -0,0 +1 @@
from edg4llm.core.interface import EDG4LLM
253 changes: 253 additions & 0 deletions build/lib/edg4llm/core/dataGenerators.py
@@ -0,0 +1,253 @@
import os
from typing import Dict, List, Any

from edg4llm.utils.logger import custom_logger
from edg4llm.models.chatglm import EDGChatGLM
from edg4llm.models.chatgpt import EDGChatGPT
from edg4llm.models.internlm import EDGInternLM
from edg4llm.models.deepseek import EDGDeepSeek
from edg4llm.generators.text_generators.answer_generator import AnswerGenerator
from edg4llm.generators.text_generators.question_generator import QuestionGenerator
from edg4llm.generators.text_generators.dialogue_generator import DialogueGenerator

from edg4llm.processor.preprocess import PreProcessor

logger = custom_logger("dataGenerator")

class DataGenerator:
def __init__(self, pConfig):
"""
Initialize the Data Generator.
This method initializes the model and its associated generators (Answer, Question, Dialogue)
based on the provided configuration parameters.
Parameters
----------
pConfig : dict
A configuration dictionary containing the following key-value pairs:
- "model_provider" : str, optional
The type of language model to use ("chatglm", "chatgpt", "internlm", "deepseek"). Default is "chatglm".
- "model_name" : str, optional
The specific model to use within the selected provider. Default is "chatglm-4-flash".
- "base_url" : str
The base URL for the LLM API. Default is None.
- "api_key" : str
The API key for authenticating requests. Default is None.
Raises
------
ValueError
If the provided "model_provider" is not supported.
Attributes
----------
model : object
The selected language model instance, initialized based on the "model_provider" configuration.
answer_generator : AnswerGenerator
An instance of the AnswerGenerator to generate answers.
question_generator : QuestionGenerator
An instance of the QuestionGenerator to generate questions.
dialogue_generator : DialogueGenerator
An instance of the DialogueGenerator to generate dialogues.
Notes
-----
- Supported model providers include: "chatglm", "chatgpt", "internlm", "deepseek".
- If the "model_provider" is unsupported, a `ValueError` will be raised.
"""

if pConfig["model_provider"] == "chatglm":
self.model = EDGChatGLM(
model_name=pConfig["model_name"],
base_url=pConfig["base_url"],
api_key=pConfig["api_key"]
)
elif pConfig["model_provider"] == "chatgpt":
self.model = EDGChatGPT(
model_name=pConfig["model_name"],
base_url=pConfig["base_url"],
api_key=pConfig["api_key"]
)
elif pConfig["model_provider"] == "internlm":
self.model = EDGInternLM(
model_name=pConfig["model_name"],
base_url=pConfig["base_url"],
api_key=pConfig["api_key"]
)
elif pConfig["model_provider"] == "deepseek":
self.model = EDGDeepSeek(
model_name=pConfig["model_name"],
base_url=pConfig["base_url"],
api_key=pConfig["api_key"]
)
else:
raise ValueError("Unsupported model provider")

self.preprocessor = PreProcessor()
self.answer_generator = AnswerGenerator(self.model)
self.question_generator = QuestionGenerator(self.model)
self.dialogue_generator = DialogueGenerator(self.model)

def generate_question(self, tConfig) -> List[Dict]:
"""
Generate questions based on the given configuration.
This method uses the `question_generator` to generate question data based on
the provided configuration options. It supports various parameters to control
the question generation process, such as task type, prompts, sampling strategies, and output formatting.
Parameters
----------
tConfig : dict
A configuration dictionary containing the following key-value pairs:
- "language" : str, optional
The language of the generated data. Must be 'zh' or 'en'.
Default is 'zh'.
- "task_type" : str, optional
The type of task for data generation. Must be 'question' to ensure valid output.
Default is 'question'.
- "system_prompt" : str, optional
A system-level prompt to guide the question generation. Default is None.
- "user_prompt" : str, optional
A user-provided prompt to initiate the question generation. Default is None.
- "do_sample" : bool, optional
Whether to use sampling during question generation. If True, enables sampling strategies like
temperature and top_p. If False, uses deterministic decoding. Default is True.
- "temperature" : float, optional
Sampling temperature to control randomness. Must be in the range [0.0, 1.0].
Default is 0.95.
- "top_p" : float, optional
Nucleus sampling parameter for controlling randomness. Must be in the range [0.0, 1.0]. Default is 0.7.
- "max_tokens" : int, optional
The maximum number of tokens to generate in the question output. Default is 4095.
- "num_samples" : int, optional
The number of question samples to generate. Default is 10.
- "output_format" : str, optional
The format of the output, such as "alpaca" or other formats. Default is "alpaca".
Returns
-------
list of dict
A list of dictionaries containing the generated question outputs.
Notes
-----
- This method uses the `generate` method from the `question_generator` to produce question data
based on the provided configuration.
- The `tConfig` dictionary allows for flexible question generation based on task type,
system/user prompts, and various sampling strategies.
"""

tConfig["user_prompt"] = self.preprocessor.question_preprocess(tConfig["language"], tConfig["user_prompt"])

data = self.question_generator.generate(tConfig)
return data

def generate_answer(self, tConfig) -> List[Dict]:
"""
Generate answers based on the given configuration.
This method uses the `answer_generator` to generate answer data based on
the provided configuration options. It supports various parameters to control
the answer generation process, such as task type, prompts, sampling strategies, and output formatting.
Parameters
----------
tConfig : dict
A configuration dictionary containing the following key-value pairs:
- "language" : str, optional
The language of the generated data. Must be 'zh' or 'en'.
Default is 'zh'.
- "task_type" : str, optional
The type of task for data generation. Must be 'answer' to ensure valid output.
Default is 'answer'.
- "system_prompt" : str, optional
A system-level prompt to guide the answer generation. Default is None.
- "user_prompt" : str, optional
A user-provided prompt to initiate the answer generation. Default is None.
- "do_sample" : bool, optional
Whether to use sampling during answer generation. If True, enables sampling strategies like
temperature and top_p. If False, uses deterministic decoding. Default is True.
- "temperature" : float, optional
Sampling temperature to control randomness. Must be in the range [0.0, 1.0].
Default is 0.95.
- "top_p" : float, optional
Nucleus sampling parameter for controlling randomness. Must be in the range [0.0, 1.0]. Default is 0.7.
- "max_tokens" : int, optional
The maximum number of tokens to generate in the answer output. Default is 4095.
- "num_samples" : int, optional
The number of answer samples to generate. Default is 10.
- "output_format" : str, optional
The format of the output, such as "json" or other formats. Default is "json".
Returns
-------
list of dict
A list of dictionaries containing the generated answer outputs.
Notes
-----
- This method uses the `generate` method from the `answer_generator` to produce answer data
based on the provided configuration.
- The `tConfig` dictionary allows for flexible answer generation based on task type,
system/user prompts, and various sampling strategies.
"""

tConfig["user_prompt"] = self.preprocessor.answer_preprocess(tConfig["language"], tConfig["user_prompt"])
data = self.answer_generator.generate(tConfig)
return data

def generate_dialogue(self, tConfig) -> List[Dict]:
"""
Generate a dialogue based on the given configuration.
This method utilizes the `dialogue_generator` to generate dialogues using the
provided configuration options. It supports various parameters to control
the text generation process, such as task type, prompts, sampling strategies, and output formatting.
Parameters
----------
tConfig : dict
A configuration dictionary containing the following key-value pairs:
- "language" : str, optional
The language of the generated data. Must be 'zh' or 'en'.
Default is 'zh'.
- "task_type" : str, optional
The type of task for data generation. Must be 'dialogue' to ensure valid output.
Default is 'dialogue'.
- "system_prompt" : str, optional
A system-level prompt to guide the text generation. Default is None.
- "user_prompt" : str, optional
A user-provided prompt to initiate the text generation. Default is None.
- "do_sample" : bool, optional
Whether to use sampling during text generation. If True, enables sampling strategies like temperature
and top_p. If False, uses deterministic decoding. Default is True.
- "temperature" : float, optional
Sampling temperature to control randomness. Must be in the range [0.0, 1.0].
Default is 0.95.
- "top_p" : float, optional
Nucleus sampling parameter for controlling randomness. Must be in the range [0.0, 1.0]. Default is 0.7.
- "max_tokens" : int, optional
The maximum number of tokens to generate in the output. Default is 4095.
- "num_samples" : int, optional
The number of output samples to generate. Default is 10.
- "output_format" : str, optional
The format of the output. Default is "alpaca".
Returns
-------
list of dict
A list of dictionaries containing the generated dialogue outputs.
Notes
-----
- This method uses the `generate` method from the `dialogue_generator` to produce dialogue outputs
based on the provided configuration.
- The `tConfig` dictionary allows for flexible generation based on task type, system/user prompts,
and various sampling strategies.
"""

tConfig["user_prompt"] = self.preprocessor.dialogue_preprocess(tConfig["language"], tConfig["user_prompt"])
data = self.dialogue_generator.generate(tConfig)
return data
