diff --git a/README.md b/README.md
index 5bc9f17..edd6e05 100644
--- a/README.md
+++ b/README.md
@@ -3,6 +3,13 @@
 ![welcome](assets/welcome.png)
+
+
diff --git a/build/lib/edg4llm/__init__.py b/build/lib/edg4llm/__init__.py new file mode 100644 index 0000000..1cdaf9f --- /dev/null +++ b/build/lib/edg4llm/__init__.py @@ -0,0 +1,7 @@ +from edg4llm.core.interface import EDG4LLM + +__all__ = ["EDG4LLM"] + +__version__ = "1.0.14" +__author__ = "Alannikos" +__license__ = "MIT" diff --git a/build/lib/edg4llm/core/__init__.py b/build/lib/edg4llm/core/__init__.py new file mode 100644 index 0000000..3f40e95 --- /dev/null +++ b/build/lib/edg4llm/core/__init__.py @@ -0,0 +1 @@ +from edg4llm.core.interface import EDG4LLM diff --git a/build/lib/edg4llm/core/dataGenerators.py b/build/lib/edg4llm/core/dataGenerators.py new file mode 100644 index 0000000..12bb8cf --- /dev/null +++ b/build/lib/edg4llm/core/dataGenerators.py @@ -0,0 +1,253 @@ +import os +from typing import Dict, List, Any + +from edg4llm.utils.logger import custom_logger +from edg4llm.models.chatglm import EDGChatGLM +from edg4llm.models.chatgpt import EDGChatGPT +from edg4llm.models.internlm import EDGInternLM +from edg4llm.models.deepseek import EDGDeepSeek +from edg4llm.generators.text_generators.answer_generator import AnswerGenerator +from edg4llm.generators.text_generators.question_generator import QuestionGenerator +from edg4llm.generators.text_generators.dialogue_generator import DialogueGenerator + +from edg4llm.processor.preprocess import PreProcessor + +logger = custom_logger("dataGenerator") + +class DataGenerator: + def __init__(self, pConfig): + """ + Initialize the Data Generator + + This method initializes the model and its associated generators (Answer, Question, Dialogue) + based on the provided configuration parameters. + + Parameters + ---------- + pConfig : dict + A configuration dictionary containing the following key-value pairs: + - "model_provider" : str, optional + The type of language model to use ("chatglm", "chatgpt", "internlm", "deepseek"). Default is "chatglm". + - "model_name" : str, optional + The specific model to use within the selected provider. Default is "chatglm-4-flash". + - "base_url" : str + The base URL for the LLM API. Default is None. + - "api_key" : str + The API key for authenticating requests. Default is None. + + Raises + ------ + ValueError + If the provided model type is not supported, raises a `ValueError`. + + Attributes + ---------- + model : object + The selected language model instance, initialized based on the "model_provider" configuration. + answer_generator : AnswerGenerator + An instance of the AnswerGenerator to generate answers. + question_generator : QuestionGenerator + An instance of the QuestionGenerator to generate questions. + dialogue_generator : DialogueGenerator + An instance of the DialogueGenerator to generate dialogues. + + Notes + ----- + - Supported model providers include: "chatglm", "chatgpt", "internlm", "deepseek". + - If the "model_provider" is unsupported, a `ValueError` will be raised. 
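# Illustrative sketch: a minimal pConfig for the constructor documented above and a direct
# DataGenerator instantiation. The endpoint URL, API key, and model name are placeholders;
# any of the four supported providers ("chatglm", "chatgpt", "internlm", "deepseek") is selected
# the same way, and an unsupported provider raises ValueError.
from edg4llm.core.dataGenerators import DataGenerator

p_config = {
    "model_provider": "chatglm",
    "model_name": "chatglm-4-flash",
    "base_url": "https://open.bigmodel.cn/api/paas/v4/chat/completions",  # placeholder endpoint
    "api_key": "your_api_key",                                            # placeholder key
}
data_generator = DataGenerator(p_config)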
+ """ + + if pConfig["model_provider"] == "chatglm": + self.model = EDGChatGLM( + model_name=pConfig["model_name"], + base_url=pConfig["base_url"], + api_key=pConfig["api_key"] + ) + elif pConfig["model_provider"] == "chatgpt": + self.model = EDGChatGPT( + model_name=pConfig["model_name"], + base_url=pConfig["base_url"], + api_key=pConfig["api_key"] + ) + elif pConfig["model_provider"] == "internlm": + self.model = EDGInternLM( + model_name=pConfig["model_name"], + base_url=pConfig["base_url"], + api_key=pConfig["api_key"] + ) + elif pConfig["model_provider"] == "deepseek": + self.model = EDGDeepSeek( + model_name=pConfig["model_name"], + base_url=pConfig["base_url"], + api_key=pConfig["api_key"] + ) + else: + raise ValueError("Unsupported model provider") + + self.preprocessor = PreProcessor() + self.answer_generator = AnswerGenerator(self.model) + self.question_generator = QuestionGenerator(self.model) + self.dialogue_generator = DialogueGenerator(self.model) + + def generate_question(self, tConfig) -> List[Dict]: + """ + Generate questions based on the given configuration. + + This method uses the `question_generator` to generate question data based on + the provided configuration options. It supports various parameters to control + the question generation process, such as task type, prompts, sampling strategies, and output formatting. + + Parameters + ---------- + tConfig : dict + A configuration dictionary containing the following key-value pairs: + - "language" : str, optional + The language of data in data generation. Must be one of 'zh', 'en'. + Default is 'zh'. + - "task_type" : str, optional + The type of task for data generation. Must be 'question' to ensure valid output. + Default is 'question'. + - "system_prompt" : str, optional + A system-level prompt to guide the question generation. Default is None. + - "user_prompt" : str, optional + A user-provided prompt to initiate the question generation. Default is None. + - "do_sample" : bool, optional + Whether to use sampling during question generation. If True, enables sampling strategies like + temperature and top_p. If False, uses deterministic decoding. Default is True. + - "temperature" : float, optional + Sampling temperature to control randomness. Must be in the range [0.0, 1.0]. + Default is 0.95. + - "top_p" : float, optional + Nucleus sampling parameter for controlling randomness. Must be in the range [0.0, 1.0]. Default is 0.7. + - "max_tokens" : int, optional + The maximum number of tokens to generate in the question output. Default is 4095. + - "num_samples" : int, optional + The number of question samples to generate. Default is 10. + - "output_format" : str, optional + The format of the output, such as "alpaca" or other formats. Default is "alpaca". + + Returns + ------- + list of dict + A list of dictionaries containing the generated question outputs. + + Notes + ----- + - This method uses the `generate` method from the `question_generator` to produce question data + based on the provided configuration. + - The `tConfig` dictionary allows for flexible question generation based on task type, + system/user prompts, and various sampling strategies. + """ + + tConfig["user_prompt"] = self.preprocessor.question_preprocess(tConfig["language"], tConfig["user_prompt"]) + + data = self.question_generator.generate(tConfig) + return data + + def generate_answer(self, tConfig) -> List[Dict]: + """ + Generate answers based on the given configuration. 
+ + This method uses the `answer_generator` to generate answer data based on + the provided configuration options. It supports various parameters to control + the answer generation process, such as task type, prompts, sampling strategies, and output formatting. + + Parameters + ---------- + tConfig : dict + A configuration dictionary containing the following key-value pairs: + - "language" : str, optional + The language of data in data generation. Must be one of 'zh', 'en'. + Default is 'zh'. + - "task_type" : str, optional + The type of task for data generation. Must be 'answer' to ensure valid output. + Default is 'answer'. + - "system_prompt" : str, optional + A system-level prompt to guide the answer generation. Default is None. + - "user_prompt" : str, optional + A user-provided prompt to initiate the answer generation. Default is None. + - "do_sample" : bool, optional + Whether to use sampling during answer generation. If True, enables sampling strategies like + temperature and top_p. If False, uses deterministic decoding. Default is True. + - "temperature" : float, optional + Sampling temperature to control randomness. Must be in the range [0.0, 1.0]. + Default is 0.95. + - "top_p" : float, optional + Nucleus sampling parameter for controlling randomness. Must be in the range [0.0, 1.0]. Default is 0.7. + - "max_tokens" : int, optional + The maximum number of tokens to generate in the answer output. Default is 4095. + - "num_samples" : int, optional + The number of answer samples to generate. Default is 10. + - "output_format" : str, optional + The format of the output, such as "json" or other formats. Default is "json". + + Returns + ------- + list of dict + A list of dictionaries containing the generated answer outputs. + + Notes + ----- + - This method uses the `generate` method from the `answer_generator` to produce answer data + based on the provided configuration. + - The `tConfig` dictionary allows for flexible answer generation based on task type, + system/user prompts, and various sampling strategies. + """ + + tConfig["user_prompt"] = self.preprocessor.answer_preprocess(tConfig["language"], tConfig["user_prompt"]) + data = self.answer_generator.generate(tConfig) + return data + + def generate_dialogue(self, tConfig) -> List[Dict]: + """ + Generate a dialogue based on the given configuration. + + This method utilizes the `dialogue_generator` to generate dialogues using the + provided configuration options. It supports various parameters to control + the text generation process, such as task type, prompts, sampling strategies, and output formatting. + + Parameters + ---------- + tConfig : dict + A configuration dictionary containing the following key-value pairs: + - "language" : str, optional + The language of data in data generation. Must be one of 'zh', 'en'. + Default is 'zh'. + - "task_type" : str, optional + The type of task for data generation. Must be one of 'question', 'answer', or 'dialogue'. + Default is 'dialogue'. + - "system_prompt" : str, optional + A system-level prompt to guide the text generation. Default is None. + - "user_prompt" : str, optional + A user-provided prompt to initiate the text generation. Default is None. + - "do_sample" : bool, optional + Whether to use sampling during text generation. If True, enables sampling strategies like temperature + and top_p. If False, uses deterministic decoding. Default is True. + - "temperature" : float, optional + Sampling temperature to control randomness. Must be in the range [0.0, 1.0]. + Default is 0.95. 
+ - "top_p" : float, optional + Nucleus sampling parameter for controlling randomness. Must be in the range [0.0, 1.0]. Default is 0.7. + - "max_tokens" : int, optional + The maximum number of tokens to generate in the output. Default is 4095. + - "num_samples" : int, optional + The number of output samples to generate. Default is 10. + - "output_format" : str, optional + The format of the output. Default is "alpaca". + + Returns + ------- + list of dict + A list of dictionaries containing the generated dialogue outputs. + + Notes + ----- + - This method uses the `generate` method from the `dialogue_generator` to produce dialogue outputs + based on the provided configuration. + - The `tConfig` dictionary allows for flexible generation based on task type, system/user prompts, + and various sampling strategies. + """ + + tConfig["user_prompt"] = self.preprocessor.dialogue_preprocess(tConfig["language"], tConfig["user_prompt"]) + data = self.dialogue_generator.generate(tConfig) + return data \ No newline at end of file diff --git a/build/lib/edg4llm/core/interface.py b/build/lib/edg4llm/core/interface.py new file mode 100644 index 0000000..64fe88d --- /dev/null +++ b/build/lib/edg4llm/core/interface.py @@ -0,0 +1,333 @@ +""" +EDG4LLM: A Comprehensive Interface for Text Generation with Configurable LLMs + +Overview +-------- +The EDG4LLM class serves as a high-level interface for generating text using a language model pipeline. +It supports configuration for task types, prompts, sampling strategies, and output formats, making it versatile +and adaptable to various use cases. + +Key Features +------------ +- Task Flexibility: Supports task types such as 'dialogue', 'question', and 'answer'. +- Custom Prompts: Allows system-level and user-level prompts to guide the generation process. +- Sampling Controls: Provides options to customize randomness and diversity of outputs using + parameters like `do_sample`, `temperature`, and `top_p`. +- Output Formats: Compatible with customizable output formats, such as "alpaca". +""" + + + +import os +from typing import Any, Tuple, Dict + +from edg4llm.utils.logger import custom_logger +from edg4llm.core.pipeline import DataPipeline + +logger = custom_logger("interface") + + +class EDG4LLM: + """ + EDG4LLM: A Class for Configurable Text Generation with LLMs + + This class provides an interface for generating text using a configurable language model pipeline. + It allows users to specify a variety of parameters, including model type, prompts, sampling strategies, + and output formats. + + Attributes + ---------- + pipeline : DataPipeline + An instance of the `DataPipeline` class, used to handle the data processing + and interaction with the language model. + + Methods + ------- + __init__(model_provider: str = "chatglm", model_name: str = "chatglm-4-flash", base_url: str = None, api_key: str = None): + Initializes the EDG4LLM instance with the model type, base URL, and API key. + + generate(task_type: str = 'dialogue', system_prompt: str = None, user_prompt: str = None, + do_sample: bool = True, temperature: float = 0.95, top_p: float = 0.7, + max_tokens: int = 4095, num_samples: int = 10, output_format: str = "alpaca") -> List[Dict]: + Generates text data based on the provided configuration. + + Notes + ----- + - This class leverages the `DataPipeline` for all interactions with the language model. + - The `generate` method is user-facing. + - Supports customization for tasks like 'dialogue', 'question', and 'answer'. 
+ - Ensures compatibility with different output formats (e.g., "alpaca"). + + Examples + -------- + >>> # Create an instance of EDG4LLM + >>> generator = EDG4LLM(model_provider="chatglm", model_name="chatglm-4-flash", base_url="https://api.example.com", api_key="your_api_key") + + >>> # Generate a dialogue response + >>> response = generator.generate( + task_type="answer", + system_prompt="You are a helpful assistant.", + user_prompt="What is the weather today?", + max_tokens=100 + ) + + >>> print(response) + Output: [{'output': 'The weather today is sunny with a high of 25°C.'}] + """ + def __init__(self, + model_provider: str = "chatglm", + model_name: str = "chatglm-4-flash", + base_url: str = None, + api_key: str = None): + """ + Initialize the EDG4LLM instance with the necessary parameters. + + Parameters + ---------- + model_provider: str, optional + The type of language model to use, by default "chatglm". + model_name : str, optional + The specific model to use within the model type, by default "chatglm-4-flash". + base_url : str, optional + The base URL of the LLM API, by default None. + api_key : str, optional + The API key for authenticating requests, by default None. + """ + + self._pConfig = { + "model_provider": model_provider + ,"model_name" : model_name + , "base_url": base_url + , "api_key" : api_key + } + + self.pipeline = DataPipeline(self._pConfig) + logger.info("DataPipeline initialized successfully with the provided configuration.") + + def generate(self + , language: str = 'zh' + , task_type: str = 'dialogue' + , system_prompt: str = None + , user_prompt: str = None + , do_sample: bool = True + , temperature: float = 0.95 + , top_p: float = 0.7 + , max_tokens: int = 4095 + , num_samples: int = 10 + , output_format: str = "alpaca" + , question_path: str = None + ): + """ + Generate text data based on the specified configuration. + + Parameters + ---------- + language : str, optional + The language of data in data generation. Must be one of 'zh', 'en'. + Default is 'zh'. + + task_type : str, optional + The type of task for data generation. Must be one of 'question', 'answer', or 'dialogue'. + Default is 'dialogue'. + + system_prompt : str, optional + A system-level prompt to guide the text generation. + Default is None. + + user_prompt : str, optional + A user-provided prompt to initiate the text generation. + Default is None. + + do_sample : bool, optional + Whether to use sampling during text generation. + - If True, enables sampling strategies like temperature and top_p. + - If False, uses deterministic decoding (e.g., greedy decoding), and + `temperature` and `top_p` are ignored. + Default is True. + + temperature : float, optional + Sampling temperature to control randomness. + - Must be a positive number in the range [0.0, 1.0]. + - Higher values produce more diverse outputs, while lower values make + the output more focused and deterministic. + Default is 0.95. + + top_p : float, optional + Nucleus sampling parameter for controlling randomness. + - Limits token selection to the top cumulative probability range + defined by p. + - Must be in the range [0.0, 1.0]. + Default is 0.7. + + max_tokens : int, optional + The maximum number of tokens to generate in the output. + - Default: 4095. + - Maximum allowed value: 4095 (values exceeding this will be capped). + + num_samples : int, optional + The number of output samples to generate. + Default is 10. + + output_format : str, optional + The format of the output. + Default is "alpaca". 
+ + question_path : str, optional + The path to a file containing a list of questions. + - Only applicable when `task_type` is set to 'answer'. + - The model will read the file and generate answers for each question in the file. + - The output will be returned in a specific format as defined by the `output_format` parameter. + Default is None. + + Returns + ------- + list of dict + A list of dictionaries containing the generated outputs. + + Examples + -------- + >>> # Create an instance of EDG4LLM + >>> generator = EDG4LLM(model_provider="chatglm", model_name="chatglm-4-flash", base_url="https://api.example.com", api_key="your_api_key") + + >>> # Generate a dialogue response + >>> response = generator.generate( + task_type="answer", + system_prompt="You are a helpful assistant.", + user_prompt="What is the weather today?", + max_tokens=100 + ) + + >>> print(response) + Output: [{'output': 'The weather today is sunny with a high of 25°C.'}] + + Notes + ----- + The method will use a pipeline's `generate_data` function to create outputs + based on the provided configuration. + """ + + data = self._generate(language, task_type, system_prompt, user_prompt, do_sample, temperature, top_p, max_tokens, num_samples, output_format, question_path) + logger.info("Data generation completed successfully for task_type: %s", task_type) + + return data + + def _generate(self, + language: str = 'zh', + task_type: str = 'dialogue', + system_prompt: str = None, + user_prompt: str = None, + do_sample: bool = True, + temperature: float = 0.95, + top_p: float = 0.7, + max_tokens: int = 4095, + num_samples: int = 10, + output_format: str = "alpaca", + question_path: str = None + ): + """ + Generate text data based on the specified configuration. + + Parameters + ---------- + language : str, optional + The language of data in data generation. Must be one of 'zh', 'en'. + Default is 'zh'. + + task_type : str, optional + The type of task for data generation. Must be one of 'question', 'answer', or 'dialogue'. + Default is 'dialogue'. + + system_prompt : str, optional + A system-level prompt to guide the text generation. + Default is None. + + user_prompt : str, optional + A user-provided prompt to initiate the text generation. + Default is None. + + do_sample : bool, optional + Whether to use sampling during text generation. + - If True, enables sampling strategies like temperature and top_p. + - If False, uses deterministic decoding (e.g., greedy decoding), and + `temperature` and `top_p` are ignored. + Default is True. + + temperature : float, optional + Sampling temperature to control randomness. + - Must be a positive number in the range [0.0, 1.0]. + - Higher values produce more diverse outputs, while lower values make + the output more focused and deterministic. + Default is 0.95. + + top_p : float, optional + Nucleus sampling parameter for controlling randomness. + - Limits token selection to the top cumulative probability range + defined by p. + - Must be in the range [0.0, 1.0]. + Default is 0.7. + + max_tokens : int, optional + The maximum number of tokens to generate in the output. + - Default: 4095. + - Maximum allowed value: 4095 (values exceeding this will be capped). + + num_samples : int, optional + The number of output samples to generate. + Default is 10. + + output_format : str, optional + The format of the output. + Default is "alpaca". + + question_path : str, optional + The path to a file containing a list of questions. + - Only applicable when `task_type` is set to 'answer'. 
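# Illustrative sketch: the two-step question/answer workflow that question_path enables. The URL,
# API key, and prompts are placeholders, and the dump step assumes each item returned by the
# question task carries a "question" field, which is what the answer task reads back from the file.
import json

from edg4llm import EDG4LLM

generator = EDG4LLM(
    model_provider="chatglm",
    model_name="chatglm-4-flash",
    base_url="https://open.bigmodel.cn/api/paas/v4/chat/completions",  # placeholder endpoint
    api_key="your_api_key",                                            # placeholder key
)

questions = generator.generate(
    language="en",
    task_type="question",
    user_prompt="Generate questions about basic Python syntax.",  # placeholder prompt
    num_samples=10,
)

with open("questions.json", "w", encoding="utf-8") as f:
    json.dump(questions, f, ensure_ascii=False, indent=2)

answers = generator.generate(
    language="en",
    task_type="answer",
    system_prompt="You are a helpful assistant.",
    user_prompt="Answer this question in detail: EDG4LLM",  # the literal "EDG4LLM" is swapped for each question
    num_samples=10,                                          # must match the number of questions in the file
    question_path="questions.json",
)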
+ - The model will read the file and generate answers for each question in the file. + - The output will be returned in a specific format as defined by the `output_format` parameter. + Default is None. + + Returns + ------- + list of dict + A list of dictionaries containing the generated outputs. + + Examples + -------- + >>> # Create an instance of EDG4LLM + >>> generator = EDG4LLM(model_provider="chatglm", model_name="chatglm-4-flash", base_url="https://api.example.com", api_key="your_api_key") + + >>> # Generate a dialogue response + >>> response = generator.generate( + task_type="answer", + system_prompt="You are a helpful assistant.", + user_prompt="What is the weather today?", + max_tokens=100 + ) + + >>> print(response) + Output: [{'output': 'The weather today is sunny with a high of 25°C.'}] + + Notes + ----- + The method will use a pipeline's `generate_data` function to create outputs + based on the provided configuration. + """ + + self._tConfig = { + "language": language, + "task_type": task_type, # The type of task for data generation + "system_prompt": system_prompt, # The system-level prompt + "user_prompt": user_prompt, # The user-provided prompt + "do_sample": do_sample, # Whether to use sampling + "temperature": temperature, # Sampling temperature + "top_p": top_p, # Nucleus sampling parameter + "max_tokens": max_tokens, # Maximum tokens in the output + "num_samples": num_samples, # Number of output samples + "output_format": output_format, # Desired output format + "question_path": question_path + } + + # Call the pipeline's generate_data method using the configuration dictionary + data = self.pipeline.generate_data(self._tConfig) + + return data diff --git a/build/lib/edg4llm/core/pipeline.py b/build/lib/edg4llm/core/pipeline.py new file mode 100644 index 0000000..d8ba770 --- /dev/null +++ b/build/lib/edg4llm/core/pipeline.py @@ -0,0 +1,88 @@ +import os +from typing import Any, Tuple, Dict + +from edg4llm.utils.logger import custom_logger +from edg4llm.core.dataGenerators import DataGenerator + +logger = custom_logger("DataPipeline") + +class DataPipeline: + """ + The DataPipeline class manages the entire process of generating data, designed to + automatically create fine-tuning data for different task types such as question + generation, answer generation, and dialogue generation. + + This class uses a DataGenerator object to handle the core logic of data generation + and dynamically executes the corresponding task based on the provided configuration + parameters. It provides a unified interface for users to easily invoke specific + data generation methods with minimal configuration. + + Attributes: + ---------- + data_generator (DataGenerator): An object that handles the specific data generation tasks. + + Methods: + ---------- + __init__(pConfig): Initializes the DataPipeline class and creates a DataGenerator + object based on the configuration. + generate_data(tConfig): Generates fine-tuning data based on the task configuration. + Supported task types include question generation, answer generation, + and dialogue generation. + """ + + def __init__(self, pConfig): + """ + Initializes the data generation process. + + Parameters + ---------- + pConfig : dict + Configuration for initializing the DataGenerator. Expected to contain: + - model_provider: str + The type of language model to use, by default "chatglm". + - model_name: str + The specific model to use within the model type, by default "chatglm-4-flash". + - base_url : str + The base URL of the LLM API. 
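# Illustrative sketch: the same pConfig/tConfig pair wired through DataPipeline directly, which is
# what the EDG4LLM interface does internally. URL, key, and prompts are placeholders.
from edg4llm.core.pipeline import DataPipeline

pipeline = DataPipeline({
    "model_provider": "chatglm",
    "model_name": "chatglm-4-flash",
    "base_url": "https://open.bigmodel.cn/api/paas/v4/chat/completions",  # placeholder endpoint
    "api_key": "your_api_key",                                            # placeholder key
})

fine_tuning_data = pipeline.generate_data({
    "language": "zh",
    "task_type": "dialogue",
    "system_prompt": "You are a helpful assistant.",
    "user_prompt": "Generate a short customer-support dialogue.",  # placeholder prompt
    "do_sample": True,
    "temperature": 0.95,
    "top_p": 0.7,
    "max_tokens": 4095,
    "num_samples": 2,
    "output_format": "alpaca",
    "question_path": None,
})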
+ - api_key : str + The API key for authentication. + """ + + self.data_generator = DataGenerator(pConfig) + + def generate_data(self, tConfig) -> Dict: + """ + Generates data based on the provided configuration. + + Parameters + ---------- + tConfig : Dict + Task configuration containing the following keys: + - task_type : str + Specifies the type of task ('question', 'answer', or 'dialogue'). + - Other parameters required for data generation, specific to the task type. + + Returns + ------- + dict + A dictionary containing the generated fine-tuning data. + + Raises + ------ + ValueError + If the provided task type is unsupported. + """ + if tConfig["task_type"] == "question": + logger.info("Generated data for task_type: 'question'") + data = self.data_generator.generate_question(tConfig) + elif tConfig["task_type"] == "answer": + logger.info("Generated data for task_type: 'answer'") + data = self.data_generator.generate_answer(tConfig) + elif tConfig["task_type"] == "dialogue": + logger.info("Generated data for task_type: 'dialogue'") + data = self.data_generator.generate_dialogue(tConfig) + else: + logger.error("Unsupported task type: %s", tConfig["task_type"]) + raise ValueError("Unsupported task type") + + return data diff --git a/build/lib/edg4llm/generators/__init__.py b/build/lib/edg4llm/generators/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/build/lib/edg4llm/generators/text_generators/__init__.py b/build/lib/edg4llm/generators/text_generators/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/build/lib/edg4llm/generators/text_generators/answer_generator.py b/build/lib/edg4llm/generators/text_generators/answer_generator.py new file mode 100644 index 0000000..a66d0c2 --- /dev/null +++ b/build/lib/edg4llm/generators/text_generators/answer_generator.py @@ -0,0 +1,191 @@ +import os +import sys +import json +from typing import Dict, Any + +from edg4llm.utils.logger import custom_logger +from edg4llm.generators.text_generators.base_generator import BaseGenerator + +logger = custom_logger("AnswerGenerator") + +class AnswerGenerator(BaseGenerator): + """ + A class for generating answers based on user queries using a specified model. + + This class extends the `BaseGenerator` class and provides functionality to generate + answers to user queries based on a given configuration. It interacts with the model's + `execute_request` method to generate responses based on system-level and user-level prompts. + It supports customization through parameters such as temperature, sampling strategies, + and token limits. + + Attributes + ---------- + model : object + The model interface used for generating answers. + + Methods + ------- + generate(tConfig: dict) -> list of dict: + Generates answers based on the provided configuration. + + Notes + ----- + - The `generate` method ensures valid answers are returned, retrying if necessary. + - It logs progress for each generated answer. + """ + + def __init__(self, model): + """ + Initialize the AnswerGenerator. + + Parameters + ---------- + model : object + The model interface used for generating answers. + """ + + super().__init__(model) + + def generate(self, tConfig) -> str: + """ + Generate answers based on the provided configuration. + + This method generates one or more answers based on the parameters provided in + the `tConfig` dictionary. 
It uses the model's `execute_request` method to generate + answers based on the system and user prompts, with options to control randomness, + output length, and sampling strategy. + + Parameters + ---------- + tConfig : dict + A configuration dictionary containing the following key-value pairs: + - "system_prompt" : str, optional + A system-level prompt that provides context for generating the answer. Default is an empty string. + - "user_prompt" : str + A user-provided prompt (query) to generate the corresponding answer. + - "model" : str, optional + The specific model to use for answer generation. Default is "glm-4-flash". + - "do_sample" : bool, optional + Whether to use sampling strategies during answer generation. Default is True. + - "temperature" : float, optional + A sampling parameter to control the randomness of the output. Must be between 0.0 and 1.0. Default is 0.95. + - "top_p" : float, optional + Nucleus sampling parameter controlling the cumulative probability range for token selection. + Must be between 0.0 and 1.0. Default is 0.7. + - "max_tokens" : int, optional + The maximum number of tokens to generate in the answer. Default is 4095. + - "num_samples" : int, optional + The number of answers to generate. Default is 1. + + Returns + ------- + list of dict + A list of dictionaries containing the generated answers. Each dictionary + includes the generated answer content and relevant metadata. + + Notes + ----- + - The method will retry generating answers if the model fails to provide a valid response. + - Progress and debug information are logged for each generated answer. + """ + + # Extract configuration parameters + system_prompt = tConfig.get("system_prompt", "") + user_prompt = tConfig.get("user_prompt", "") + do_sample = tConfig.get("do_sample", True) + temperature = tConfig.get("temperature", 0.95) + top_p = tConfig.get("top_p", 0.7) + max_tokens = tConfig.get("max_tokens", 4095) + num_samples = tConfig.get("num_samples", 1) # Default is to generate 1 sample + question_path = tConfig.get("question_path", None) + + try: + with open(question_path, "r", encoding="utf-8") as file: + data = json.load(file) + + if isinstance(data, dict): # If it's a single dictionary, wrap it in a list + data = [data] + elif not isinstance(data, list): # Ensure it's a list of dictionaries + raise ValueError("Invalid JSON structure. Expected a list or a dictionary.") + + # Extract questions + questions = [item["question"] for item in data if "question" in item] + except FileNotFoundError: + logger.error("The file at path %s was not found.", question_path) + return None + except json.JSONDecodeError as e: + logger.error("Error decoding JSON from file %s: %s", question_path, str(e)) + return None + except Exception as e: + logger.error("Unexpected error: %s", str(e)) + return None + + if len(questions) != num_samples: + logger.error( + "The number of questions (%d) does not match the expected number (%d). 
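# Illustrative sketch: the shape of the JSON file that question_path is expected to point at. The
# loader above accepts either a single object or a list of objects and keeps only entries that
# contain a "question" key; the question texts below are placeholders.
#
# [
#     {"question": "What is a Python list comprehension?"},
#     {"question": "How does a for loop differ from a while loop?"}
# ]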
Please check your input.", + len(questions), + num_samples, + ) + + sys.exit(1) # 非零退出码表示异常终止 + + # List to store the generated dialogues + dialogues = [] + + # Generate dialogues for the specified number of samples + total_samples = num_samples # Total number of samples to generate + logger.info("Starting the data generation process.") + for _idx, question in enumerate(questions): + retry_count = 0 # 初始化重试计数 + max_retries = 5 # 设置最大重试次数(根据需要调整) + + while True: # Keep trying until valid dialogue data is generated + retry_count += 1 + + generated_answer = self.model.execute_request( + system_prompt=system_prompt, + user_prompt=user_prompt.replace("EDG4LLM", question), + do_sample=do_sample, + temperature=temperature, + top_p=top_p, + max_tokens=max_tokens, + ) + + if "error" in generated_answer: + logger.warning( + "Sample %d: Request failed with error: %s. Retrying (%d/%d)...", + _idx + 1, + generated_answer["error"], + retry_count, + max_retries, + ) + + if retry_count >= max_retries: + logger.error("Sample %d: Max retries reached. Skipping this sample.", _idx + 1) + break # 跳出当前样本,进入下一个 + continue # 继续当前样本的生成 + + # Convert the generated dialogue to the desired format (e.g., Alpaca format) + converted_generated_answer = self._convert_original_to_alpaca_answer(system_prompt, question, generated_answer) + + if converted_generated_answer is not None: + # If the dialogue is valid, append it to the results and break the loop + dialogues.append(converted_generated_answer) + break + else: + logger.warning( + "Sample %d: Generated answer is None. Retrying (%d/%d)...", + _idx + 1, + retry_count, + max_retries, + ) + + if retry_count >= max_retries: + logger.error("Sample %d: Max retries reached. Skipping this sample.", _idx + 1) + break # 跳出当前样本 + + # Log the progress of dialogue generation + progress = ((_idx+1) / total_samples) * 100 + logger.info("Data generation progress: %.2f%% (%d/%d samples completed)", progress, _idx+1, total_samples) + + return dialogues diff --git a/build/lib/edg4llm/generators/text_generators/base_generator.py b/build/lib/edg4llm/generators/text_generators/base_generator.py new file mode 100644 index 0000000..a857635 --- /dev/null +++ b/build/lib/edg4llm/generators/text_generators/base_generator.py @@ -0,0 +1,131 @@ +import os +from abc import ABC, abstractmethod +from typing import Dict + +from edg4llm.processor.postprocess import PostProcessor +class BaseGenerator(ABC): + """ + Base class for all data generators, defining a common interface for generating data. + + This class serves as a foundation for different types of data generators, providing common functionality + such as interaction with a model and post-processing of generated data. Specific generators should extend + this class and implement their own `generate` method. + + Attributes + ---------- + model : object + The model interface used for generating data. + postprocessor : PostProcessor + An instance of the PostProcessor class for handling post-processing of generated data. + + Methods + ------- + generate(prompt: str) -> str + Abstract method to generate data based on a prompt. Must be implemented by subclasses. + + """ + def __init__(self, model): + """ + Initialize the generator. + + Parameters + ---------- + model : object + The model interface used for generating data. + """ + + self.model = model + self.postprocessor = PostProcessor() + + @abstractmethod + def generate(self, prompt: str) -> str: + """ + Convert original data into Alpaca format. 
+ + This method uses the PostProcessor to process conversation data and structure it + in a format suitable for Alpaca-based models. + + Parameters + ---------- + system_prompt : str + The system-level prompt for context in the Alpaca format. + single_data : str + The raw conversation data to be processed. + + Returns + ------- + dict + The conversation data converted to Alpaca format. + """ + pass + + def _convert_original_to_alpaca(self, system_prompt, single_data): + """ + Convert original data into Alpaca format. + + This method uses the PostProcessor to process conversation data and structure it + in a format suitable for Alpaca-based models. + + Parameters + ---------- + system_prompt : str + The system-level prompt for context in the Alpaca format. + single_data : str + The raw conversation data to be processed. + + Returns + ------- + dict + The conversation data converted to Alpaca format. + """ + + converted_data = self.postprocessor.dialogue_postprocessing(conversation_data=single_data, system_prompt=system_prompt) + + return converted_data + + def _convert_original_to_json(self, single_data): + """ + Convert original data into JSON format. + + This method uses the PostProcessor to process raw data into a JSON-compatible structure. + + Parameters + ---------- + single_data : str + The raw question data to be processed. + + Returns + ------- + dict + The data converted into JSON format. + """ + + converted_data = self.postprocessor.question_postprocessing(question_data=single_data) + + return converted_data + + def _convert_original_to_alpaca_answer(self, system_prompt, question, single_data): + """ + Convert original data into Alpaca answer format. + + This method uses the PostProcessor to process raw data into an answer format suitable for Alpaca-based models. + + Parameters + ---------- + system_prompt : str + The system-level prompt for context in the Alpaca format. + question : str + The question text for which the answer is generated. + single_data : str + The raw answer data to be processed. + + Returns + ------- + dict + The data converted into Alpaca format. + """ + + converted_data = self.postprocessor.answer_postprocessing(question=question, answer=single_data, system_prompt=system_prompt) + + return converted_data + \ No newline at end of file diff --git a/build/lib/edg4llm/generators/text_generators/dialogue_generator.py b/build/lib/edg4llm/generators/text_generators/dialogue_generator.py new file mode 100644 index 0000000..e1a9e71 --- /dev/null +++ b/build/lib/edg4llm/generators/text_generators/dialogue_generator.py @@ -0,0 +1,159 @@ +import os +from typing import Dict, List, Any + +from edg4llm.utils.logger import custom_logger +from edg4llm.generators.text_generators.base_generator import BaseGenerator + +logger = custom_logger("DialogueGenerator") + +class DialogueGenerator(BaseGenerator): + """ + Dialogue Generator class for generating dialogues using a specified model. + + This class extends the `BaseGenerator` and utilizes the given model to generate dialogues + based on user input and system prompts. It provides flexibility to control generation parameters + like sampling strategies, temperature, and output format. + + Parameters + ---------- + model : object + The model interface used for generating dialogues. This model must have the + `execute_request` method for generating dialogue based on the given parameters. + """ + + def __init__(self, model): + """ + Initialize the Dialogue Generator. 
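# Illustrative sketch: the minimal shape of a concrete generator built on BaseGenerator (defined
# above). The class name and its one-shot behaviour are hypothetical; real generators such as
# DialogueGenerator add retry handling and progress logging around the same two calls.
from edg4llm.generators.text_generators.base_generator import BaseGenerator

class SingleShotGenerator(BaseGenerator):
    def generate(self, tConfig):
        raw = self.model.execute_request(
            system_prompt=tConfig.get("system_prompt", ""),
            user_prompt=tConfig.get("user_prompt", ""),
        )
        # Reuse the inherited post-processing helper to return JSON-structured data
        return self._convert_original_to_json(raw)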
+ + This constructor initializes the `DialogueGenerator` by calling the base class constructor + with the provided model. It sets up the necessary components for generating dialogues. + + Parameters + ---------- + model : object + The model interface to be used for generating dialogues. It should provide + the `execute_request` method to generate data based on the parameters. + + Notes + ----- + The `model` should be capable of handling inputs like system prompts, user prompts, + and additional parameters for controlling the text generation process. + """ + super().__init__(model) + + def generate(self, tConfig) -> List: + """ + Generate dialogues based on the provided configuration. + + This method generates one or more dialogues based on the parameters provided in + the `tConfig` dictionary. The method interacts with the model's `execute_request` + function to generate dialogue based on the system and user prompts. It also supports + various options for controlling randomness, output length, and sampling strategy. + + Parameters + ---------- + tConfig : dict + A configuration dictionary containing the following key-value pairs: + - "system_prompt" : str, optional + A system-level prompt that guides the dialogue generation. Default is an empty string. + - "user_prompt" : str, optional + A user-provided prompt to initiate the dialogue generation. Default is an empty string. + - "model" : str, optional + The specific model to use for generation. Default is "glm-4-flash". + - "do_sample" : bool, optional + Whether to use sampling strategies during text generation. Default is True. + - "temperature" : float, optional + A sampling parameter to control the randomness of output. Must be between 0.0 and 1.0. Default is 0.95. + - "top_p" : float, optional + Nucleus sampling parameter controlling the cumulative probability range for token selection. + Must be between 0.0 and 1.0. Default is 0.7. + - "max_tokens" : int, optional + The maximum number of tokens to generate. Default is 4095. + - "num_samples" : int, optional + The number of dialogue samples to generate. Default is 1. + + Returns + ------- + list of dict + A list of dictionaries containing the generated dialogues. Each dictionary + includes the generated dialogue content. + + Notes + ----- + - The method will attempt to generate dialogues until a valid response is generated. + If the generated dialogue is `None`, it will retry. + - Progress is logged for each sample generated. 
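# Illustrative sketch: the subset of tConfig keys that generate() below actually reads. Keys such
# as language and task_type are handled earlier in the pipeline and are not required here. The
# prompts are placeholders, and dialogue_generator stands for an existing DialogueGenerator instance.
dialogue_config = {
    "system_prompt": "You are a patient math tutor.",                  # placeholder persona
    "user_prompt": "Generate one tutoring dialogue about fractions.",  # placeholder prompt
    "do_sample": True,
    "temperature": 0.95,
    "top_p": 0.7,
    "max_tokens": 4095,
    "num_samples": 2,
}
dialogues = dialogue_generator.generate(dialogue_config)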
+ """ + + # Extract configuration parameters + system_prompt = tConfig.get("system_prompt", "") + user_prompt = tConfig.get("user_prompt", "") + do_sample = tConfig.get("do_sample", True) + temperature = tConfig.get("temperature", 0.95) + top_p = tConfig.get("top_p", 0.7) + max_tokens = tConfig.get("max_tokens", 4095) + num_samples = tConfig.get("num_samples", 1) # Default is to generate 1 sample + + # List to store the generated dialogues + dialogues = [] + + # Generate dialogues for the specified number of samples + total_samples = num_samples # Total number of samples to generate + logger.info("Starting the data generation process.") + for _idx in range(1, num_samples + 1): + retry_count = 0 # 初始化重试计数 + max_retries = 5 # 设置最大重试次数(根据需要调整) + + while True: # Keep trying until valid dialogue data is generated + retry_count += 1 + + generated_dialogue = self.model.execute_request( + system_prompt=system_prompt, + user_prompt=user_prompt, + do_sample=do_sample, + temperature=temperature, + top_p=top_p, + max_tokens=max_tokens, + ) + + if "error" in generated_dialogue: + logger.warning( + "Sample %d: Request failed with error: %s. Retrying (%d/%d)...", + _idx, + generated_dialogue["error"], + retry_count, + max_retries, + ) + + if retry_count >= max_retries: + logger.error("Sample %d: Max retries reached. Skipping this sample.", _idx) + break # 跳出当前样本,进入下一个 + + continue # 继续当前样本的生成 + + + # Convert the generated dialogue to the desired format (e.g., Alpaca format) + converted_generated_dialogue = self._convert_original_to_alpaca(system_prompt, generated_dialogue) + + if converted_generated_dialogue is not None: + # If the dialogue is valid, append it to the results and break the loop + dialogues.append(converted_generated_dialogue) + break + else: + logger.warning( + "Sample %d: Generated dialogue is None. Retrying (%d/%d)...", + _idx, + retry_count, + max_retries, + ) + + if retry_count >= max_retries: + logger.error("Sample %d: Max retries reached. Skipping this sample.", _idx) + break # 跳出当前样本 + + + # Log the progress of dialogue generation + progress = (_idx / total_samples) * 100 + logger.info("Data generation progress: %.2f%% (%d/%d samples completed)", progress, _idx, total_samples) + + return dialogues diff --git a/build/lib/edg4llm/generators/text_generators/question_generator.py b/build/lib/edg4llm/generators/text_generators/question_generator.py new file mode 100644 index 0000000..3a4d99e --- /dev/null +++ b/build/lib/edg4llm/generators/text_generators/question_generator.py @@ -0,0 +1,151 @@ +import os +from typing import Dict, List, Any +from edg4llm.utils.logger import custom_logger +from edg4llm.generators.text_generators.base_generator import BaseGenerator + +logger = custom_logger("QuestionGenerator") + +class QuestionGenerator(BaseGenerator): + """ + A class for generating questions based on user prompts and configuration. + + This class extends the `BaseGenerator` class and provides functionality to generate + questions using a specified model. It interacts with the model's `execute_request` + method to create output based on user-defined parameters such as sampling strategies, + temperature, and maximum tokens. + + Attributes + ---------- + model : object + The model interface used for generating questions. + + Methods + ------- + generate(tConfig: dict) -> list of dict: + Generates questions based on the provided configuration. + + Notes + ----- + - The `generate` method ensures valid responses are returned, retrying if necessary. 
+ - Logs progress for each generated question. + """ + + def __init__(self, model): + """ + Initialize the QuestionGenerator. + + Parameters + ---------- + model : object + The model interface used for generating questions. + """ + + super().__init__(model) + + def generate(self, tConfig: Dict) -> List: + """ + Generate questions based on the provided configuration. + + This method generates one or more questions using the parameters specified + in the `tConfig` dictionary. It interacts with the model's `execute_request` + method to generate output based on user prompts and various sampling options. + + Parameters + ---------- + tConfig : dict + A dictionary containing configuration options for question generation: + - "system_prompt" : str, optional + A system-level instruction to guide the question generation. Default is an empty string. + - "user_prompt" : str, optional + A user-provided input to guide the question generation. Default is an empty string. + - "model" : str, optional + Specifies the model for text generation. Default is "glm-4-flash". + - "do_sample" : bool, optional + Whether to use sampling during generation. Default is True. + - "temperature" : float, optional + Controls randomness in output. Value should be between 0.0 and 1.0. Default is 0.95. + - "top_p" : float, optional + Nucleus sampling parameter to limit token selection to a cumulative probability. Default is 0.7. + - "max_tokens" : int, optional + The maximum number of tokens for the output. Default is 4095. + - "num_samples" : int, optional + The number of question samples to generate. Default is 1. + + Returns + ------- + list of dict + A list of dictionaries containing the generated questions. + + Notes + ----- + - The method retries generation until a valid response is obtained. + - Logs progress for each generated sample. + """ + + # Extract parameters from the configuration + system_prompt = tConfig.get("system_prompt", "") + user_prompt = tConfig.get("user_prompt", "") + do_sample = tConfig.get("do_sample", True) + temperature = tConfig.get("temperature", 0.95) + top_p = tConfig.get("top_p", 0.7) + max_tokens = tConfig.get("max_tokens", 4095) + num_samples = tConfig.get("num_samples", 1) + + # Initialize a list to store generated questions + questions = [] + cur_len = 0 + # Generate questions for the specified number of samples + logger.info("Starting the data generation process.") + for _idx in range(1, num_samples + 1): + retry_count = 0 # 初始化重试计数 + max_retries = 5 # 设置最大重试次数(根据需要调整) + + while True: # Retry until a valid question is generated + retry_count += 1 + + generated_question = self.model.execute_request( + system_prompt=system_prompt, + user_prompt=user_prompt, + do_sample=do_sample, + temperature=temperature, + top_p=top_p, + max_tokens=max_tokens, + ) + + if "error" in generated_question: + logger.warning( + "Sample %d: Request failed with error: %s. Retrying (%d/%d)...", + _idx, + generated_question["error"], + retry_count, + max_retries, + ) + + if (retry_count >= max_retries): + logger.error("Sample %d: Max retries reached. Skipping this sample.", _idx) + break # 跳出当前样本 + + # Convert the raw output to a specific format + converted_question = self._convert_original_to_json(generated_question) + + if converted_question is not None: + cur_len = len(converted_question) + questions.extend(converted_question) + break + else: + logger.warning( + "Sample %d: Generated dialogue is None. 
Retrying (%d/%d)...", + _idx, + retry_count, + max_retries, + ) + + if retry_count >= max_retries: + logger.error("Sample %d: Max retries reached. Skipping this sample.", _idx) + break # 跳出当前样本 + + # Log progress for tracking generation completion + progress = (_idx / num_samples) * 100 + logger.info("Generation progress: %.2f%% (%d samples generated, %d/%d epoch completed)", progress, cur_len, _idx, num_samples) + + return questions diff --git a/build/lib/edg4llm/models/__init__.py b/build/lib/edg4llm/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/build/lib/edg4llm/models/baseModel.py b/build/lib/edg4llm/models/baseModel.py new file mode 100644 index 0000000..d3c62b8 --- /dev/null +++ b/build/lib/edg4llm/models/baseModel.py @@ -0,0 +1,126 @@ +""" +Module for defining the base class of EDG models. + +This file contains the abstract base class `EDGBaseModel`, which serves as a foundation for implementing various +machine learning models. The class defines key methods that must be implemented by any derived model class +to handle requests, send HTTP requests, and interact with APIs. + +Classes +------- +EDGBaseModel(ABC) + Abstract base class for EDG models, providing a standard structure for derived model implementations. + +Methods +------- +__init__(api_key: str = None, base_url: str = None, model_name: str = None) + Initializes the base model with API key, base URL, and model name. + +execute_request(system_prompt: str, user_prompt: str, **kwargs) -> str + Abstract method to process user input and generate model responses. + Must be implemented by derived classes. + +send_request(request: Dict[str, Any]) -> Dict[str, Any] + Abstract method to send HTTP requests and handle server interactions. + Must be implemented by derived classes. +""" + +import requests +from abc import ABC, abstractmethod +from typing import Any, Dict + +from edg4llm.utils.logger import custom_logger + +logger = custom_logger('baseModel') + + +class EDGBaseModel(ABC): + """ + Abstract base class for EDG models. + + This class defines the blueprint for machine learning model implementations. Derived classes must + implement methods to process user prompts, interact with APIs, and handle HTTP requests. + + Attributes + ---------- + api_key : str + The API key required for authenticating requests. + + base_url : str + The base URL of the model API endpoint. + + model_name : str + The name of the model, used to differentiate between various models. + """ + + def __init__(self, api_key: str = None, base_url: str = None, model_name: str = None): + """ + Initializes the base model with API key, base URL, and model name. + + Parameters + ---------- + api_key : str, optional + The API key for authenticating requests. Default is None. + + base_url : str, optional + The base URL of the model API endpoint. Default is None. + + model_name : str, optional + The name of the model, used for identifying different models. Default is None. + """ + self.api_key = api_key + self.base_url = base_url + self.model_name = model_name + + @abstractmethod + def execute_request(self, system_prompt: str, user_prompt: str, **kwargs) -> str: + """ + Abstract method to process and execute a request. + + This method must be implemented by derived classes. It processes user input and generates + responses based on a system prompt and additional parameters. + + Parameters + ---------- + system_prompt : str + The system-level instruction or prompt defining the role or behavior of the model. 
+ + user_prompt : str + The user's input or query for the model. + + kwargs : dict + Additional parameters for processing the request. + + Returns + ------- + str + The response generated by the model. + + Notes + ----- + - Derived classes should implement this method to handle the specific logic for generating responses. + """ + pass + + @abstractmethod + def send_request(self, request: Dict[str, Any]) -> Dict[str, Any]: + """ + Abstract method to send HTTP requests. + + This method must be implemented by derived classes to handle API interactions and perform + error handling for HTTP requests. + + Parameters + ---------- + request : dict + A dictionary containing all necessary information for the HTTP request. + + Returns + ------- + dict + The server's response as a dictionary. + + Notes + ----- + - Derived classes should implement this method to handle API-specific logic and error handling. + """ + pass diff --git a/build/lib/edg4llm/models/chatglm.py b/build/lib/edg4llm/models/chatglm.py new file mode 100644 index 0000000..5c99629 --- /dev/null +++ b/build/lib/edg4llm/models/chatglm.py @@ -0,0 +1,273 @@ +import os +import requests +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union, cast + +from edg4llm.utils.logger import custom_logger +from edg4llm.models.baseModel import EDGBaseModel +from edg4llm.utils.exceptions import HttpClientError, InvalidPromptError + +logger = custom_logger('chatglm') + +class EDGChatGLM(EDGBaseModel): + """ + EDGChatGLM interface for interacting with the ChatGLM model to generate text based on given prompts. + + This class provides an interface to interact with the ChatGLM model for generating text + based on a system and user prompt. It supports customizable parameters such as temperature, + sampling strategies, and model selection. It also handles HTTP requests and error management. + + Parameters + ---------- + base_url : str, optional + The base URL for the ChatGLM API. If not provided, defaults to None. + api_key : str, optional + The API key for authenticating with the ChatGLM API. If not provided, defaults to None. + """ + + def __init__(self, base_url: str = None, api_key: str = None, model_name: str = 'glm-4-flash'): + """ + Initialize the ChatGLM model interface. + + This constructor initializes the `EDGChatGLM` class by calling the base class constructor + and passing the API key, base URL, and model name ("ChatGLM"). It sets up the necessary + configuration for interacting with the ChatGLM API. + + Parameters + ---------- + base_url : str, optional + The base URL for the ChatGLM API. Default is None. + api_key : str, optional + The API key for authenticating with the ChatGLM API. Default is None. + model_name: str, optional + The specific model to use within the selected provider. Default is "glm-4-flash". + Notes + ----- + The base URL and API key are required for successful communication with the ChatGLM API. + """ + super().__init__(api_key, base_url, model_name=model_name) + + def execute_request( + self, + system_prompt: str = None, + user_prompt: str = None, + do_sample: bool = True, + temperature: float = 0.95, + top_p: float = 0.7, + max_tokens: int = 4095 + ) -> str: + """ + Generate text using the ChatGLM model based on the provided prompts and parameters. + + This method calls the internal request execution function and handles the text + generation process using the specified system and user prompts. 
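# Illustrative sketch: the minimal shape of a new provider built on EDGBaseModel (defined above).
# Only the two abstract methods need implementing; the class name and payload details here are
# hypothetical, while EDGChatGLM in this file shows a full implementation with error handling.
import requests

from edg4llm.models.baseModel import EDGBaseModel

class EDGCustomProvider(EDGBaseModel):
    def execute_request(self, system_prompt: str, user_prompt: str, **kwargs) -> str:
        request = {
            "url": self.base_url,
            "headers": {
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json",
            },
            "json": {
                "model": self.model_name,
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                **kwargs,  # e.g. temperature, top_p, max_tokens
            },
        }
        return self.send_request(request)

    def send_request(self, request):
        # Assumes an OpenAI-style chat-completion endpoint; see EDGChatGLM for full error handling
        response = requests.post(request["url"], headers=request["headers"], json=request["json"], timeout=30)
        response.raise_for_status()
        return response.json()["choices"][0]["message"]["content"].strip()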
It allows controlling + text generation via parameters such as temperature, sampling strategy, and token limits. + + Parameters + ---------- + system_prompt : str, optional + The system-level prompt that sets the context for the conversation. Default is None. + user_prompt : str, optional + The user-provided prompt that initiates the conversation. Default is None. + do_sample : bool, optional + Whether to use sampling during text generation. Default is True. + temperature : float, optional + Sampling temperature to control randomness. Default is 0.95. + top_p : float, optional + Nucleus sampling parameter for controlling randomness. Default is 0.7. + max_tokens : int, optional + The maximum number of tokens to generate in the output. Default is 4095. + + Returns + ------- + str + The generated text content from the model. + + Raises + ------ + InvalidPromptError + If both the system and user prompts are None. + """ + response = self._execute_request(system_prompt, user_prompt, self.model_name, do_sample, temperature, top_p, max_tokens) + return response + + def send_request(self, request: Dict[str, Any]) -> Dict[str, Any]: + """ + Send an HTTP request to the ChatGLM API. + + This method sends a POST request to the ChatGLM API with the provided request data. + It returns the response data as a dictionary. + + Parameters + ---------- + request : dict + A dictionary containing the request data, including the URL, headers, and JSON body. + + Returns + ------- + dict + The response from the API in the form of a dictionary. + + Raises + ------ + HttpClientError + If any error occurs during the HTTP request process. + """ + response = self._send_request(request=request) + return response + + def _send_request(self, request: Dict[str, Any]) -> Dict[str, Any]: + """ + Internal method to send a POST request to the ChatGLM API. + + This method handles the actual HTTP POST request to the ChatGLM API. It includes + error handling for HTTP errors, connection issues, timeouts, and JSON decoding. + + Parameters + ---------- + request : dict + A dictionary containing the request data, including the URL, headers, and JSON body. + + Returns + ------- + dict + The JSON response from the API. + + Raises + ------ + HttpClientError + If an error occurs during the request. + """ + url = request.get("url", "https://open.bigmodel.cn/api/paas/v4/chat/completions") + headers = {**request.get("headers", {})} + json = request.get("json", {}) + try: + response = requests.post( + url=url, + headers=headers, + json=json, + timeout=30, + ) + response.raise_for_status() + return response.json()["choices"][0]["message"]["content"].strip() + + except requests.exceptions.HTTPError as e: + # Handle HTTP error exceptions + status_code = e.response.status_code + logger.error( + "HTTP error occurred. 
Status Code: %s, URL: %s, Message: %s", + status_code, + url, + e, + ) + + return {"error": "HTTP error", "status_code": status_code, "message": str(e)} + + + except requests.exceptions.ConnectionError as e: + # Handle connection errors + logger.error("Connection error occurred while connecting to %s: %s", url, e) + + return {"error": "Connection error", "message": str(e)} + + except requests.exceptions.Timeout as e: + # Handle timeout errors + logger.error("Timeout occurred while sending request to %s: %s", url, e) + + return {"error": "Timeout", "message": str(e)} + + + except requests.exceptions.RequestException as e: + # Handle any generic request exceptions + logger.error( + "Request exception occurred while sending request to %s: %s", url, e + ) + + return {"error": "Request exception", "message": str(e)} + + + except ValueError as e: + # Handle JSON decoding errors + logger.error("JSON decoding error occurred: %s", e) + + return {"error": "JSON decoding error", "message": str(e)} + + except Exception as e: + # Catch any unexpected errors + logger.critical( + "An unexpected error occurred while sending request to %s: %s", url, e + ) + + return {"error": "Unexpected error", "message": str(e)} + + def _execute_request( + self, + system_prompt: str = None, + user_prompt: str = None, + model: str = "glm-4-flash", + do_sample: bool = True, + temperature: float = 0.95, + top_p: float = 0.7, + max_tokens: int = 4095 + ) -> str: + """ + Internal method to prepare the request data and execute the request for text generation. + + This method prepares the necessary data (including headers, JSON body) for the + ChatGLM API request and then calls the `send_request` method to send the request + and return the response. + + Parameters + ---------- + system_prompt : str, optional + The system-level prompt that provides context for the dialogue generation. + Default is None. + user_prompt : str, optional + The user-provided prompt that initiates the generation. + Default is None. + model : str, optional + The model to use for the generation. Default is "glm-4-flash". + do_sample : bool, optional + Whether to use sampling during text generation. Default is True. + temperature : float, optional + Sampling temperature to control randomness. Default is 0.95. + top_p : float, optional + Nucleus sampling parameter for controlling randomness. Default is 0.7. + max_tokens : int, optional + The maximum number of tokens to generate. Default is 4095. + + Returns + ------- + str + The generated text content from the model. + + Raises + ------ + InvalidPromptError + If both the system and user prompts are None. 
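# Note (illustrative): the _send_request method above assumes an OpenAI-style chat-completion
# response and returns the stripped content string from it, i.e. a body shaped roughly like:
#
# {
#     "choices": [
#         {"message": {"role": "assistant", "content": "generated text..."}}
#     ]
# }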
+ """ + if (system_prompt is None and user_prompt is None): + logger.error("Both prompts cannot be empty") + raise InvalidPromptError("Both prompts cannot be empty") + + request_data = { + "url": f"{self.base_url}", + "headers": { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + }, + "json": { + "model": model, + "messages": [ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt}, + ], + "do_sample": do_sample, + "temperature": temperature, + "top_p": top_p, + "max_tokens": max_tokens, + }, + } + + response = self.send_request(request_data) + + return response diff --git a/build/lib/edg4llm/models/chatgpt.py b/build/lib/edg4llm/models/chatgpt.py new file mode 100644 index 0000000..6b7ad18 --- /dev/null +++ b/build/lib/edg4llm/models/chatgpt.py @@ -0,0 +1,286 @@ +import os +import requests +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union, cast + +from edg4llm.utils.logger import custom_logger +from edg4llm.models.baseModel import EDGBaseModel +from edg4llm.utils.exceptions import HttpClientError, InvalidPromptError + +logger = custom_logger('chatgpt') + +class EDGChatGPT(EDGBaseModel): + """ + A class to interface with the ChatGPT model for text generation. + + This class extends the `EDGBaseModel` abstract base class to implement a specific interface + for interacting with the ChatGPT API. It supports text generation using system-level and + user-level prompts with customizable parameters such as temperature, sampling strategies, + and token limits. The class also includes methods to handle HTTP requests and manage errors. + + Attributes + ---------- + base_url : str + The base URL for the ChatGPT API endpoint. + api_key : str + The API key for authenticating with the ChatGPT API. + model_name : str + The specific model to use, defaulting to "gpt-4o-mini". + + Methods + ------- + execute_request(system_prompt: str, user_prompt: str, do_sample: bool, temperature: float, top_p: float, max_tokens: int) -> str: + Generates text using the ChatGPT model based on the provided prompts and parameters. + + send_request(request: Dict[str, Any]) -> Dict[str, Any]: + Sends an HTTP POST request to the ChatGPT API and returns the response as a dictionary. + + Notes + ----- + - The `base_url` and `api_key` are required for proper communication with the ChatGPT API. + - Provides detailed error handling for HTTP, connection, timeout, and JSON decoding issues. + - Supports customizable text generation parameters for flexibility in model behavior. + """ + + def __init__(self, base_url:str = None, api_key: str = None, model_name: str = "gpt-4o-mini"): + """ + Initialize the ChatGPT model interface. + + Parameters + ---------- + base_url : str, optional + The base URL for the ChatGPT API. Default is None. + api_key : str, optional + The API key for authenticating with the ChatGPT API. Default is None. + model_name : str, optional + The specific model to use, defaulting to "gpt-4o-mini". + """ + + super().__init__(api_key, base_url, model_name=model_name) + + def execute_request( + self + , system_prompt: str = None + , user_prompt: str = None + , do_sample: bool = True + , temperature: float = 0.95 + , top_p: float = 0.7 + , max_tokens: int = 4095 + ) -> str: + + """ + Generate text using the ChatGPT model based on the provided prompts and parameters. + + Parameters + ---------- + system_prompt : str, optional + The system-level prompt providing context for the text generation. Default is None. 
+ user_prompt : str, optional + The user-provided prompt initiating the text generation. Default is None. + do_sample : bool, optional + Whether to use sampling during text generation. Default is True. + temperature : float, optional + Sampling temperature to control randomness. Default is 0.95. + top_p : float, optional + Nucleus sampling parameter to control randomness. Default is 0.7. + max_tokens : int, optional + The maximum number of tokens to generate. Default is 4095. + + Returns + ------- + str + The generated text content from the model. + + Raises + ------ + InvalidPromptError + If both system and user prompts are None. + """ + + response = self._execute_request(system_prompt, user_prompt, self.model_name, do_sample, temperature, top_p, max_tokens) + return response + + def send_request(self, request: Dict[str, Any]) -> Dict[str, Any]: + + """ + Send an HTTP request to the ChatGPT API. + + Parameters + ---------- + request : dict + A dictionary containing the request data, including the URL, headers, and JSON body. + + Returns + ------- + dict + The response from the API in the form of a dictionary. + + Raises + ------ + HttpClientError + If any error occurs during the HTTP request process. + """ + + response = self._send_request(request=request) + return response + + def _send_request(self, request: Dict[str, Any]) -> Dict[str, Any]: + + """ + Internal method to send an HTTP POST request to the ChatGPT API. + + This method handles the actual HTTP POST request and manages error handling + for issues like connection failures, timeouts, and JSON decoding errors. + + Parameters + ---------- + request : dict + A dictionary containing the request data, including the URL, headers, and JSON body. + + Returns + ------- + dict + The JSON response from the API. + + Raises + ------ + HttpClientError + If an error occurs during the HTTP request. + """ + + url = request.get("url", "https://api.openai.com/v1/chat/completions") + headers = {**request.get("headers", {})} + json = request.get("json", {}) + try: + response = requests.post( + url=url, + headers=headers, + json=json, + timeout=30, + ) + + response.raise_for_status() + + return response.json()["choices"][0]["message"]["content"].strip() + + except requests.exceptions.HTTPError as e: + # Handle HTTP error exceptions + status_code = e.response.status_code + logger.error( + "HTTP error occurred. 
Status Code: %s, URL: %s, Message: %s", + status_code, + url, + e, + ) + + return {"error": "HTTP error", "status_code": status_code, "message": str(e)} + + + except requests.exceptions.ConnectionError as e: + # Handle connection errors + logger.error("Connection error occurred while connecting to %s: %s", url, e) + + return {"error": "Connection error", "message": str(e)} + + except requests.exceptions.Timeout as e: + # Handle timeout errors + logger.error("Timeout occurred while sending request to %s: %s", url, e) + + return {"error": "Timeout", "message": str(e)} + + + except requests.exceptions.RequestException as e: + # Handle any generic request exceptions + logger.error( + "Request exception occurred while sending request to %s: %s", url, e + ) + + return {"error": "Request exception", "message": str(e)} + + + except ValueError as e: + # Handle JSON decoding errors + logger.error("JSON decoding error occurred: %s", e) + + return {"error": "JSON decoding error", "message": str(e)} + + except Exception as e: + # Catch any unexpected errors + logger.critical( + "An unexpected error occurred while sending request to %s: %s", url, e + ) + + return {"error": "Unexpected error", "message": str(e)} + + + def _execute_request( + self + , system_prompt: str = None + , user_prompt: str = None + , model: str = "gpt-4o-mini" + , do_sample: bool = True + , temperature: float = 0.95 + , top_p: float = 0.7 + , max_tokens: int = 4095 + ) -> str: + + """ + Internal method to prepare and execute the API request for text generation. + + Parameters + ---------- + system_prompt : str, optional + The system-level prompt providing context for the text generation. Default is None. + user_prompt : str, optional + The user-provided prompt initiating the text generation. Default is None. + model : str, optional + The specific model to use for text generation. Default is "gpt-4o-mini". + do_sample : bool, optional + Whether to use sampling during text generation. Default is True. + temperature : float, optional + Sampling temperature to control randomness. Default is 0.95. + top_p : float, optional + Nucleus sampling parameter to control randomness. Default is 0.7. + max_tokens : int, optional + The maximum number of tokens to generate. Default is 4095. + + Returns + ------- + str + The generated text content from the model. + + Raises + ------ + InvalidPromptError + If both system and user prompts are None. 
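+
+        Examples
+        --------
+        For reference, the request body assembled by this method has roughly the
+        following shape (all values are illustrative placeholders):
+
+        >>> request_json = {
+        ...     "model": "gpt-4o-mini",
+        ...     "messages": [
+        ...         {"role": "developer", "content": "You are a helpful assistant."},
+        ...         {"role": "user", "content": "Write one sentence about data generation."},
+        ...     ],
+        ...     "temperature": 0.95,
+        ...     "top_p": 0.7,
+        ...     "max_tokens": 4095,
+        ... }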
+ """ + + if (system_prompt is None and user_prompt is None): + logger.error("prompt不能同时为空") + raise InvalidPromptError("prompt不能同时为空") + + request_data = { + "url": f"{self.base_url}", + "headers": { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + }, + "json": { + "model": model, + "messages": [ + { + "role": "developer", + "content": system_prompt, + }, + { + "role": "user", + "content": user_prompt, + } + ], + "temperature": temperature, + "top_p": top_p, + "max_tokens": max_tokens + }, + } + + response = self.send_request(request_data) + return response diff --git a/build/lib/edg4llm/models/deepseek.py b/build/lib/edg4llm/models/deepseek.py new file mode 100644 index 0000000..edd6990 --- /dev/null +++ b/build/lib/edg4llm/models/deepseek.py @@ -0,0 +1,294 @@ +import os +import json +import requests +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union, cast + +from edg4llm.utils.logger import custom_logger +from edg4llm.models.baseModel import EDGBaseModel +from edg4llm.utils.exceptions import HttpClientError, InvalidPromptError + +logger = custom_logger('deepseek') + +class EDGDeepSeek(EDGBaseModel): + """ + A class to interface with the DeepSeek model for text generation. + + This class extends the `EDGBaseModel` abstract base class to implement a specific interface + for interacting with the DeepSeek API. It allows generating text based on system-level and + user-level prompts, with customizable parameters such as temperature, sampling strategies, + and token limits. The class includes methods to handle HTTP requests and manage errors + specific to the DeepSeek API. + + Attributes + ---------- + base_url : str + The base URL for the DeepSeek API endpoint. + api_key : str + The API key for authenticating with the DeepSeek API. + model_name : str + The specific model to use, defaulting to "deepseek-chat". + + Methods + ------- + execute_request(system_prompt: str, user_prompt: str, do_sample: bool, temperature: float, top_p: float, max_tokens: int) -> str: + Generates text using the DeepSeek model based on the provided prompts and parameters. + + send_request(request: Dict[str, Any]) -> Dict[str, Any]: + Sends an HTTP POST request to the DeepSeek API and returns the response as a dictionary. + + Notes + ----- + - The `base_url` and `api_key` are required for proper communication with the DeepSeek API. + - Provides detailed error handling for HTTP, connection, timeout, and JSON decoding issues. + - Supports customizable text generation parameters for flexibility in model behavior. + """ + + def __init__(self, base_url:str = None, api_key: str = None, model_name: str = "deepseek-chat"): + """ + Initialize the DeepSeek model interface. + + Parameters + ---------- + base_url : str, optional + The base URL for the DeepSeek API. Default is None. + api_key : str, optional + The API key for authenticating with the DeepSeek API. Default is None. + model_name : str, optional + The specific model to use, defaulting to "deepseek-chat". + """ + + super().__init__(api_key=api_key, base_url=base_url, model_name = model_name) + + def execute_request( + self + , system_prompt: str = None + , user_prompt: str = None + , do_sample: bool = True + , temperature: float = 0.95 + , top_p: float = 0.7 + , max_tokens: int = 4095 + ) -> str: + """ + Generate text using the DeepSeek model based on the provided prompts and parameters. 
+ + Parameters + ---------- + system_prompt : str, optional + The system-level prompt providing context for the text generation. Default is None. + user_prompt : str, optional + The user-provided prompt initiating the text generation. Default is None. + do_sample : bool, optional + Whether to use sampling during text generation. Default is True. + temperature : float, optional + Sampling temperature to control randomness. Default is 0.95. + top_p : float, optional + Nucleus sampling parameter to control randomness. Default is 0.7. + max_tokens : int, optional + The maximum number of tokens to generate. Default is 4095. + + Returns + ------- + str + The generated text content from the model. + + Raises + ------ + InvalidPromptError + If both system and user prompts are None. + """ + + response = self._execute_request(system_prompt, user_prompt, self.model_name, do_sample, temperature, top_p, max_tokens) + return response + + def send_request(self, request: Dict[str, Any]) -> Dict[str, Any]: + """ + Send an HTTP request to the DeepSeek API. + + Parameters + ---------- + request : dict + A dictionary containing the request data, including the URL, headers, and JSON body. + + Returns + ------- + dict + The response from the API in the form of a dictionary. + + Raises + ------ + HttpClientError + If any error occurs during the HTTP request process. + """ + + response = self._send_request(request=request) + return response + + def _send_request(self, request: Dict[str, Any]) -> Dict[str, Any]: + """ + Internal method to send an HTTP POST request to the DeepSeek API. + + This method handles the actual HTTP POST request and manages error handling + for issues like connection failures, timeouts, and JSON decoding errors. + + Parameters + ---------- + request : dict + A dictionary containing the request data, including the URL, headers, and JSON body. + + Returns + ------- + dict + The JSON response from the API. + + Raises + ------ + HttpClientError + If an error occurs during the HTTP request. + """ + + url = request.get("url", "https://api.deepseek.com/chat/completions") + headers = {**request.get("headers", {})} + data = request.get("data", {}) + + if isinstance(data, dict): + data = json.dumps(data) + + try: + response = requests.request( + "POST", + url=url, + headers=headers, + data=data, + # timeout=30, + ) + + response.raise_for_status() + return response.json()["choices"][0]["message"]["content"].strip() + + except requests.exceptions.HTTPError as e: + # Handle HTTP error exceptions + status_code = e.response.status_code + logger.error( + "HTTP error occurred. 
Status Code: %s, URL: %s, Message: %s", + status_code, + url, + e, + ) + + return {"error": "HTTP error", "status_code": status_code, "message": str(e)} + + + except requests.exceptions.ConnectionError as e: + # Handle connection errors + logger.error("Connection error occurred while connecting to %s: %s", url, e) + + return {"error": "Connection error", "message": str(e)} + + except requests.exceptions.Timeout as e: + # Handle timeout errors + logger.error("Timeout occurred while sending request to %s: %s", url, e) + + return {"error": "Timeout", "message": str(e)} + + + except requests.exceptions.RequestException as e: + # Handle any generic request exceptions + logger.error( + "Request exception occurred while sending request to %s: %s", url, e + ) + + return {"error": "Request exception", "message": str(e)} + + + except ValueError as e: + # Handle JSON decoding errors + logger.error("JSON decoding error occurred: %s", e) + + return {"error": "JSON decoding error", "message": str(e)} + + except Exception as e: + # Catch any unexpected errors + logger.critical( + "An unexpected error occurred while sending request to %s: %s", url, e + ) + + return {"error": "Unexpected error", "message": str(e)} + + def _execute_request( + self + , system_prompt: str = None + , user_prompt: str = None + , model: str = "deepseek-chat" + , do_sample: bool = True + , temperature: float = 0.95 + , top_p: float = 0.7 + , max_tokens: int = 2047 + ) -> str: + + """ + Internal method to prepare and execute the API request for text generation. + + Parameters + ---------- + system_prompt : str, optional + The system-level prompt providing context for the text generation. Default is None. + user_prompt : str, optional + The user-provided prompt initiating the text generation. Default is None. + model : str, optional + The specific model to use for text generation. Default is "deepseek-chat". + do_sample : bool, optional + Whether to use sampling during text generation. Default is True. + temperature : float, optional + Sampling temperature to control randomness. Default is 0.95. + top_p : float, optional + Nucleus sampling parameter to control randomness. Default is 0.7. + max_tokens : int, optional + The maximum number of tokens to generate. Default is 2047. + + Returns + ------- + str + The generated text content from the model. + + Raises + ------ + InvalidPromptError + If both system and user prompts are None. 
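+
+        Examples
+        --------
+        An illustrative sketch (placeholder API key; the URL is the public
+        DeepSeek chat-completions endpoint) of reaching this method through
+        ``execute_request``:
+
+        >>> model = EDGDeepSeek(
+        ...     base_url="https://api.deepseek.com/chat/completions",
+        ...     api_key="YOUR_API_KEY",
+        ...     model_name="deepseek-chat",
+        ... )
+        >>> reply = model.execute_request(
+        ...     system_prompt="You are a helpful assistant.",
+        ...     user_prompt="Say hello in one sentence.",
+        ...     max_tokens=2047,
+        ... )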
+ """ + + if (system_prompt is None and user_prompt is None): + logger.error("prompt不能同时为空") + raise InvalidPromptError("prompt不能同时为空") + + request_data = { + "url": self.base_url, + "data": { + "messages": [ + {"content": system_prompt, "role": "system"}, + {"content": user_prompt, "role": "user"} + ], + "model": model, + "frequency_penalty": 0, + "max_tokens": max_tokens, + "presence_penalty": 0, + "response_format": {"type": "text"}, + "stop": None, + "stream": False, + "stream_options": None, + "temperature": temperature, + "top_p": top_p, + "tools": None, + "tool_choice": "none", + "logprobs": False, + "top_logprobs": None + }, + "headers": { + 'Content-Type': 'application/json', + 'Accept': 'application/json', + 'Authorization': f'Bearer {self.api_key}' + } + } + + response = self._send_request(request_data) + + return response diff --git a/build/lib/edg4llm/models/internlm.py b/build/lib/edg4llm/models/internlm.py new file mode 100644 index 0000000..6f51807 --- /dev/null +++ b/build/lib/edg4llm/models/internlm.py @@ -0,0 +1,281 @@ +import os +import requests +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union, cast + +from edg4llm.utils.logger import custom_logger +from edg4llm.models.baseModel import EDGBaseModel +from edg4llm.utils.exceptions import HttpClientError, InvalidPromptError + +logger = custom_logger('internlm') + +class EDGInternLM(EDGBaseModel): + """ + A class to interface with the InternLM model for text generation. + + This class extends the `EDGBaseModel` abstract base class to implement a specific interface + for interacting with the InternLM API. It allows generating text based on system-level and + user-level prompts, with customizable parameters such as temperature, sampling strategies, + and token limits. The class includes methods to handle HTTP requests and manage errors + specific to the InternLM API. + + Attributes + ---------- + base_url : str + The base URL for the InternLM API endpoint. + api_key : str + The API key for authenticating with the InternLM API. + model_name : str + The specific model to use, defaulting to "internlm2.5-latest". + + Methods + ------- + execute_request(system_prompt: str, user_prompt: str, model: str, do_sample: bool, temperature: float, top_p: float, max_tokens: int) -> str: + Generates text using the InternLM model based on the provided prompts and parameters. + + send_request(request: Dict[str, Any]) -> Dict[str, Any]: + Sends an HTTP POST request to the InternLM API and returns the response as a dictionary. + + Notes + ----- + - The `base_url` and `api_key` are required for proper communication with the InternLM API. + - Provides detailed error handling for HTTP, connection, timeout, and JSON decoding issues. + - Supports customizable text generation parameters for flexibility in model behavior. + """ + + def __init__(self, base_url:str = None, api_key: str = None, model_name: str = "internlm2.5-latest"): + """ + Initialize the InternLM model interface. + + Parameters + ---------- + base_url : str, optional + The base URL for the InternLM API. Default is None. + api_key : str, optional + The API key for authenticating with the InternLM API. Default is None. + model_name : str, optional + The specific model to use, defaulting to "internlm2.5-latest". 
+ """ + super().__init__(api_key, base_url, model_name=model_name) + + def execute_request( + self + , system_prompt: str = None + , user_prompt: str = None + , model: str = "internlm2.5-latest" + , do_sample: bool = True + , temperature: float = 0.95 + , top_p: float = 0.7 + , max_tokens: int = 4095 + ) -> str: + """ + Generate text using the InternLM model based on the provided prompts and parameters. + + Parameters + ---------- + system_prompt : str, optional + The system-level prompt providing context for the text generation. Default is None. + user_prompt : str, optional + The user-provided prompt initiating the text generation. Default is None. + model : str, optional + The specific model to use for text generation, defaulting to "internlm2.5-latest". + do_sample : bool, optional + Whether to use sampling during text generation. Default is True. + temperature : float, optional + Sampling temperature to control randomness. Default is 0.95. + top_p : float, optional + Nucleus sampling parameter to control randomness. Default is 0.7. + max_tokens : int, optional + The maximum number of tokens to generate. Default is 4095. + + Returns + ------- + str + The generated text content from the model. + + Raises + ------ + InvalidPromptError + If both system and user prompts are None. + """ + + response = self._execute_request(system_prompt, user_prompt, model, do_sample, temperature, top_p, max_tokens) + return response + + def send_request(self, request: Dict[str, Any]) -> Dict[str, Any]: + """ + Send an HTTP request to the InternLM API. + + Parameters + ---------- + request : dict + A dictionary containing the request data, including the URL, headers, and JSON body. + + Returns + ------- + dict + The response from the API in the form of a dictionary. + + Raises + ------ + HttpClientError + If any error occurs during the HTTP request process. + """ + + response = self._send_request(request=request) + return response + + def _send_request(self, request: Dict[str, Any]) -> Dict[str, Any]: + """ + Internal method to send an HTTP POST request to the InternLM API. + + This method handles the actual HTTP POST request and manages error handling + for issues like connection failures, timeouts, and JSON decoding errors. + + Parameters + ---------- + request : dict + A dictionary containing the request data, including the URL, headers, and JSON body. + + Returns + ------- + dict + The JSON response from the API. + + Raises + ------ + HttpClientError + If an error occurs during the HTTP request. + """ + + url = request.get("url", "https://internlm-chat.intern-ai.org.cn/puyu/api/v1/chat/completions") + headers = {**request.get("headers", {})} + json = request.get("json", {}) + try: + response = requests.post( + url=url, + headers=headers, + json=json, + timeout=30, + ) + + response.raise_for_status() + return response.json()["choices"][0]["message"]["content"].strip() + + except requests.exceptions.HTTPError as e: + # Handle HTTP error exceptions + status_code = e.response.status_code + logger.error( + "HTTP error occurred. 
Status Code: %s, URL: %s, Message: %s", + status_code, + url, + e, + ) + + return {"error": "HTTP error", "status_code": status_code, "message": str(e)} + + + except requests.exceptions.ConnectionError as e: + # Handle connection errors + logger.error("Connection error occurred while connecting to %s: %s", url, e) + + return {"error": "Connection error", "message": str(e)} + + except requests.exceptions.Timeout as e: + # Handle timeout errors + logger.error("Timeout occurred while sending request to %s: %s", url, e) + + return {"error": "Timeout", "message": str(e)} + + except requests.exceptions.RequestException as e: + # Handle any generic request exceptions + logger.error( + "Request exception occurred while sending request to %s: %s", url, e + ) + + return {"error": "Request exception", "message": str(e)} + + except ValueError as e: + # Handle JSON decoding errors + logger.error("JSON decoding error occurred: %s", e) + + return {"error": "JSON decoding error", "message": str(e)} + + except Exception as e: + # Catch any unexpected errors + logger.critical( + "An unexpected error occurred while sending request to %s: %s", url, e + ) + + return {"error": "Unexpected error", "message": str(e)} + + def _execute_request( + self + , system_prompt: str = None + , user_prompt: str = None + , model: str = "glm-4-flash" + , do_sample: bool = True + , temperature: float = 0.95 + , top_p: float = 0.7 + , max_tokens: int = 4095 + ) -> str: + """ + Internal method to prepare and execute the API request for text generation. + + Parameters + ---------- + system_prompt : str, optional + The system-level prompt providing context for the text generation. Default is None. + user_prompt : str, optional + The user-provided prompt initiating the text generation. Default is None. + model : str, optional + The specific model to use for text generation. Default is "internlm2.5-latest". + do_sample : bool, optional + Whether to use sampling during text generation. Default is True. + temperature : float, optional + Sampling temperature to control randomness. Default is 0.95. + top_p : float, optional + Nucleus sampling parameter to control randomness. Default is 0.7. + max_tokens : int, optional + The maximum number of tokens to generate. Default is 4095. + + Returns + ------- + str + The generated text content from the model. + + Raises + ------ + InvalidPromptError + If both system and user prompts are None. 
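+
+        Examples
+        --------
+        Assuming ``model`` is an ``EDGInternLM`` instance, the public
+        ``execute_request`` that delegates to this method also accepts a
+        ``model`` argument, so a specific InternLM model can be chosen per call
+        (prompts are illustrative):
+
+        >>> reply = model.execute_request(
+        ...     system_prompt="You are a helpful assistant.",
+        ...     user_prompt="Summarise what fine-tuning data is.",
+        ...     model="internlm2.5-latest",
+        ... )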
+ """ + + if (system_prompt is None and user_prompt is None): + logger.error("prompt不能同时为空") + raise InvalidPromptError("prompt不能同时为空") + + request_data = { + "url": f"{self.base_url}", + "headers": { + "Authorization": f"Bearer {self.api_key}", + "Content-Type": "application/json", + }, + "json": { + "model": model, + "messages": [ + { + "role": "system", + "content": system_prompt, + }, + { + "role": "user", + "content": user_prompt, + } + ], + "temperature": temperature, + "top_p": top_p, + "max_tokens": max_tokens + }, + } + + response = self.send_request(request_data) + return response diff --git a/build/lib/edg4llm/processor/__init__.py b/build/lib/edg4llm/processor/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/build/lib/edg4llm/processor/postprocess.py b/build/lib/edg4llm/processor/postprocess.py new file mode 100644 index 0000000..fd18904 --- /dev/null +++ b/build/lib/edg4llm/processor/postprocess.py @@ -0,0 +1,231 @@ +import json +from typing import Dict, List, Any + +from edg4llm.utils.logger import custom_logger + +logger = custom_logger("PostProcessor") + +class PostProcessor: + """ + A class for post-processing conversation and question data. + + This class provides methods to clean and structure raw data obtained from API responses or external sources. + It handles the removal of unnecessary markdown formatting, parses the data into valid JSON format, and + structures it for further use in applications such as chatbots or AI assistants. It can also incorporate + an optional system prompt into the processed data for context. + + Methods + ------- + dialogue_postprocessing(conversation_data: Dict[str, str], system_prompt: str = None): + Processes raw conversation data by cleaning, parsing, and adding an optional system prompt. + + question_postprocessing(question_data: str = None): + Processes raw question data by cleaning and structuring it into a list of questions. + + answer_postprocessing(question: str, answer: str, system_prompt: str = None): + Processes raw answer data by cleaning, parsing, and structuring it along with the question + and an optional system prompt. + """ + + def __init__(self): + pass + + def dialogue_postprocessing(self, conversation_data: Dict[str, str], system_prompt: str = None): + """ + Post-process conversation data. + + This function processes raw conversation data by removing unnecessary formatting and parsing it + into a valid JSON format. If a system-level prompt (system_prompt) is provided, it will be added + as an "instruction" field to the first conversation entry. The processed data is returned as a + dictionary with a "conversation" key. + + Parameters + ---------- + conversation_data : str + The raw conversation data in string format, typically from an API response or an external source. + It may contain markdown-style formatting such as "```json" or "```" that needs to be removed. + + system_prompt : str, optional + An optional system-level prompt that will be added to the "instruction" field of the first + conversation entry. If not provided, an empty string will be used. Default is None. + + Returns + ------- + dict or None + Returns a dictionary containing the processed conversation data structured under the "conversation" key. + Each item in the list corresponds to a conversation entry. If an error occurs during JSON parsing, + the function logs the error and returns None. 
+ + Examples + -------- + >>> conversation_data = ''' + [ + {"input": "AAA", "output": "BBBB"}, + {"input": "CCC", "output": "DDDD"} + ] + ''' + >>> system_prompt = "You are a helpful assistant." + >>> processed_data = postprocessing(conversation_data, system_prompt) + + >>> # Output: + >>> { + "conversation": [ + {"input": "AAA", "output": "BBBB", "instruction": "You are a helpful assistant."}, + {"input": "CCC", "output": "DDDD"} + ] + } + + Notes + ----- + - The function removes any markdown formatting (like "```json" or "```") before parsing the data. + - If JSON parsing fails, an error is logged, and the function returns None. + """ + try: + # Clean and parse the JSON conversation data + conversation_data = json.loads(conversation_data.replace("```json", "").replace("```", "")) + except Exception as exception: + logger.error("Error parsing JSON: %s", str(exception)) + return None + + # Initialize the result dictionary with a "conversation" key + result = {"conversation": []} + + # Add the system prompt as an instruction to the first conversation entry if provided + for idx, data in enumerate(conversation_data): + if idx == 0: + data["instruction"] = system_prompt if system_prompt is not None else "" + result["conversation"].append(data) + + return result + + + def question_postprocessing(self, question_data: str = None): + """ + Post-process the question data. + + This function processes raw question data by removing unnecessary formatting and ensuring + it is in a valid JSON format. It converts each question into a structured dictionary with + the key "question" holding the processed content. + + Parameters + ---------- + question_data : str + The raw question data in string format, typically from an API response or external source. + The string may contain markdown-style formatting such as "```json" or "```" that should be removed. + + Returns + ------- + dict or None + Returns a dictionary with the format {"question": }. + If an error occurs during JSON parsing, it returns None. + + Examples + -------- + >>> question_data = "What is your name?" + >>> processed_data = question_postprocessing(question_data) + >>> print(processed_data) + Output: {'question': 'What is your name?'} + + Notes + ----- + - This function removes any markdown formatting (e.g., "```json" or "```") from the input string. + - If an exception occurs during JSON parsing, an error message is logged, and the function returns None. + """ + + try: + # Clean up and parse the JSON question data + question_data = json.loads(question_data.replace("```json", "").replace("```", "")) + except Exception as exception: + logger.error("Error parsing JSON: %s", str(exception)) + return None + + # Initialize the result with a "question" key + result = [] + + # Extract the question and assign it to the result + for _, data in enumerate(question_data): + result.append(data) + + return result + + def answer_postprocessing(self, question: str, answer: str, system_prompt: str = None): + """ + Post-process conversation data. + + This function processes raw conversation data by parsing it into a valid JSON format and structuring + it into a predefined format. It also adds an optional system prompt to each conversation entry + under the "instruction" key. The processed data is returned as a dictionary wrapped in a list. + + Parameters + ---------- + question : str + The input question or query from the user. + + answer : str + The raw answer data in string format, typically containing JSON content. 
+ This string may contain markdown formatting (e.g., "```json" or "```") that needs to be removed. + + system_prompt : str, optional + An optional system-level prompt to provide context or instructions. This will be added to + each conversation entry under the "instruction" key. Default is None. + + Returns + ------- + list or None + Returns a list containing a dictionary with the processed conversation data. + The dictionary has a "conversation" key, which is a list of conversation entries. + Each entry contains "input", "output", and "instruction" keys. + If an error occurs during JSON parsing, the function logs the error and returns None. + + Examples + -------- + >>> # Input: + >>> question = "What is AI?" + >>> answer = ''' + [ + { + "input": question, + "output": "BBB" + } + ] + ''' + >>> system_prompt = "You are a helpful assistant." + + >>> # Function Call: + >>> processed_data = answer_postprocessing(question, answer, system_prompt) + + >>> # Output: + >>> [ + { + "conversation": [ + { + "input": "What is AI?", + "output": "BBB", + "instruction": "You are a helpful assistant." + } + ] + } + ] + + Notes + ----- + - The function removes any markdown formatting (like "```json" or "```") before parsing the data. + - If JSON parsing fails, the function logs an error and returns None. + - The output is wrapped in a list to allow for future extensibility. + """ + + try: + # Clean up and parse the JSON conversation data + conversation_data = json.loads(answer.replace("```json","").replace("```","")) + except Exception as exception: + logger.error("Error parsing JSON: %s", str(exception)) + return None + + # Initialize the result with a conversation key + result = {"conversation": []} + conversation = {"instruction" : system_prompt, "input" : question} + # Add the system prompt to the first conversation entry if provided + for idx, data in enumerate(conversation_data): + conversation['output'] = data["answer"] + result["conversation"].append(conversation) + return result diff --git a/build/lib/edg4llm/processor/preprocess.py b/build/lib/edg4llm/processor/preprocess.py new file mode 100644 index 0000000..a8ebe1e --- /dev/null +++ b/build/lib/edg4llm/processor/preprocess.py @@ -0,0 +1,139 @@ +import re +import sys +import json + +from edg4llm.utils.logger import custom_logger +from edg4llm.utils.data_utils import is_question_template_consistent +from edg4llm.utils.data_utils import is_answer_template_consistent +from edg4llm.utils.data_utils import is_dialogue_template_consistent + +from edg4llm.utils.template import Template + +logger = custom_logger("preprocess") + +class PreProcessor: + """ + A class for pre-processing user prompts before data generation. + + This class provides methods to validate and repair user prompts in different modes such as question, + answer, and dialogue. If a user prompt does not match the expected template, the methods automatically + append the corresponding format guidelines to ensure consistency. + + Methods + ------- + question_preprocess(user_prompt: str) -> str: + Validates and repairs user prompts in question mode. + + answer_preprocess(user_prompt: str) -> str: + Validates and repairs user prompts in answer mode. + + dialogue_preprocess(user_prompt: str) -> str: + Validates and repairs user prompts in Q&A (dialogue) mode. + """ + def __init__(self): + pass + + def question_preprocess(self, language: str, user_prompt: str) -> str: + """ + Validates and processes user prompts in question mode. 
+ + Parameters + ---------- + language : str + The language of data in data generation. Must be one of 'zh', 'en'. + + user_prompt : str + The user's input prompt to be processed in question mode. + + Returns + ------- + str + The validated and, if necessary, repaired user prompt. + + Notes + ----- + - If the user prompt matches the question template, it is returned unchanged. + - If the user prompt does not match, format guidelines from `Template.question_template` + are appended to the prompt. + """ + + if is_question_template_consistent(user_prompt=user_prompt): + logger.info("User prompt matches the question template. Proceeding with data generation.") + return user_prompt + else: + logger.warning("User prompt does not match the question template. Automatically added format guidelines.") + if language == "zh": + repaired_user_prompt = user_prompt + '\n' + Template.question_zh_template + else: + repaired_user_prompt = user_prompt + '\n' + Template.question_en_template + return repaired_user_prompt + + def answer_preprocess(self, language: str, user_prompt: str) -> str: + """ + Validates and processes user prompts in answer mode. + + Parameters + ---------- + language : str + The language of data in data generation. Must be one of 'zh', 'en'. + + user_prompt : str + The user's input prompt to be processed in answer mode. + + Returns + ------- + str + The validated and, if necessary, repaired user prompt. + + Notes + ----- + - If the user prompt matches the answer template, it is returned unchanged. + - If the user prompt does not match, format guidelines from `Template.answer_template` + are appended to the prompt. + """ + + if is_answer_template_consistent(user_prompt=user_prompt): + logger.info("User prompt matches the answer template. Proceeding with data generation.") + return user_prompt + else: + logger.warning("User prompt does not match the answer template. Automatically added format guidelines.") + if language == "zh": + repaired_user_prompt = user_prompt + '\n' + Template.answer_zh_template + else: + repaired_user_prompt = user_prompt + '\n' + Template.answer_en_template + return repaired_user_prompt + + def dialogue_preprocess(self, language: str, user_prompt: str) -> str: + """ + Validates and processes user prompts in Q&A (dialogue) mode. + + Parameters + ---------- + language : str + The language of data in data generation. Must be one of 'zh', 'en'. + + user_prompt : str + The user's input prompt to be processed in Q&A mode. + + Returns + ------- + str + The validated and, if necessary, repaired user prompt. + + Notes + ----- + - If the user prompt matches the dialogue template, it is returned unchanged. + - If the user prompt does not match, format guidelines from `Template.dialogue_template` + are appended to the prompt. + """ + + if is_dialogue_template_consistent(user_prompt=user_prompt): + logger.info("User prompt matches the dialogue template. Proceeding with data generation.") + return user_prompt + else: + logger.warning("User prompt does not match the dialogue template. 
Automatically added format guidelines.") + if language == "zh": + repaired_user_prompt = user_prompt + '\n' + Template.dialogue_zh_template + else: + repaired_user_prompt = user_prompt + '\n' + Template.dialogue_en_template + return repaired_user_prompt diff --git a/build/lib/edg4llm/utils/__init__.py b/build/lib/edg4llm/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/build/lib/edg4llm/utils/config.py b/build/lib/edg4llm/utils/config.py new file mode 100644 index 0000000..a4534eb --- /dev/null +++ b/build/lib/edg4llm/utils/config.py @@ -0,0 +1,8 @@ +import dataclasses + +@dataclasses +class DefaultConfig: + """ + A placeholder class for default configuration settings. + """ + pass diff --git a/build/lib/edg4llm/utils/data_utils.py b/build/lib/edg4llm/utils/data_utils.py new file mode 100644 index 0000000..d928539 --- /dev/null +++ b/build/lib/edg4llm/utils/data_utils.py @@ -0,0 +1,157 @@ +import json +import re +from typing import Dict, List, Any + +def is_question_template_consistent(user_prompt: str) -> bool: + """ + Check if the user prompt contains a consistent question JSON template. + + Parameters + ---------- + user_prompt : str + The user-provided prompt to be validated. + + Returns + ------- + bool + True if the user prompt contains a valid and consistent question JSON template, + False otherwise. + + Notes + ----- + - The function uses a regular expression to extract the JSON template and compares it + with the target template. + - The target template is: + [ + { + "question": "AAA" + } + ] + - Returns False if the JSON extraction or comparison fails. + """ + target_template = [ + { + "question": "AAA" + } + ] + + # Regular expression to extract JSON template + pattern = r"\[\s*{\s*\"question\"\s*:\s*\"AAA\"\s*}\s*\]" + match = re.search(pattern, user_prompt) + + if match: + try: + extracted_template = json.loads(match.group(0)) + except json.JSONDecodeError: + return False + return extracted_template == target_template + return False + +def is_answer_template_consistent(user_prompt: str) -> bool: + """ + Check if the user prompt contains a consistent answer JSON template. + + Parameters + ---------- + user_prompt : str + The user-provided prompt to be validated. + + Returns + ------- + bool + True if the user prompt contains a valid and consistent answer JSON template, + False otherwise. + + Notes + ----- + - The function uses a regular expression to extract the JSON template and compares it + with the target template. + - The target template is: + [ + { + "answer": "AAA" + } + ] + - Returns False if the JSON extraction or comparison fails. + """ + target_template = [ + { + "answer": "AAA" + } + ] + + # Regular expression to extract JSON template + pattern = r"\[\s*{\s*\"answer\"\s*:\s*\"AAA\"\s*}\s*\]" + match = re.search(pattern, user_prompt) + + if match: + try: + extracted_template = json.loads(match.group(0)) + except json.JSONDecodeError: + return False + return extracted_template == target_template + return False + +def is_dialogue_template_consistent(user_prompt: str) -> bool: + """ + Check if the user prompt contains a consistent dialogue JSON template. + + Parameters + ---------- + user_prompt : str + The user-provided prompt to be validated. + + Returns + ------- + bool + True if the user prompt contains a valid and consistent dialogue JSON template, + False otherwise. + + Notes + ----- + - The function uses a regular expression to check for the dialogue JSON structure. 
+ - The expected template format is: + [ + { + "input": "AAA", + "output": "BBB" + } + ] + """ + + pattern = r"\[\s*\{\{\s*\"input\"\s*:\s*\"AAA\"\s*,\s*\"output\"\s*:\s*\"BBB\"\s*\}\}\s*\]" + match = re.search(pattern, user_prompt) + return match is not None + +def save_data_to_json(data: List[Dict], output_path: str): + """ + Save a list of dictionaries to a JSON file. + + Parameters + ---------- + data : list of dict + A list of dictionaries to be saved to a JSON file. Each dictionary should contain + the data to be written. + + output_path : str + The path (including the filename) where the JSON data will be saved. + The file will be written in UTF-8 encoding. + + Returns + ------- + None + This function does not return any value. It saves the data to the specified file. + + Examples + -------- + >>> data = [{"name": "John", "age": 30}, {"name": "Jane", "age": 25}] + >>> save_data_to_json(data, "output.json") + + Notes + ----- + - The function uses `json.dump` to write the data to the file. + - Non-ASCII characters are preserved with the `ensure_ascii=False` argument. + - The file will be saved with an indentation of 4 spaces to make it human-readable. + """ + with open(output_path, 'w', encoding='utf-8') as f: + json.dump(data, f, ensure_ascii=False, indent=4) diff --git a/build/lib/edg4llm/utils/exceptions.py b/build/lib/edg4llm/utils/exceptions.py new file mode 100644 index 0000000..515dd57 --- /dev/null +++ b/build/lib/edg4llm/utils/exceptions.py @@ -0,0 +1,35 @@ +from typing import Optional + + +class HttpClientError(Exception): + """ + Exception raised for errors encountered in the HTTP client. + + Parameters + ---------- + message : str + A detailed error message describing the issue. + status_code : Optional[int], optional + The HTTP status code associated with the error, by default None. + + Attributes + ---------- + status_code : Optional[int] + The HTTP status code associated with the error. + """ + + def __init__(self, message: str, status_code: Optional[int] = None): + super().__init__(message) + self.status_code = status_code + + +class InvalidPromptError(Exception): + """ + Custom exception raised when an invalid or empty prompt is encountered. + + Notes + ----- + This exception is intended to handle cases where a required prompt input + is missing or invalid. + """ + pass diff --git a/build/lib/edg4llm/utils/logger.py b/build/lib/edg4llm/utils/logger.py new file mode 100644 index 0000000..de430c1 --- /dev/null +++ b/build/lib/edg4llm/utils/logger.py @@ -0,0 +1,104 @@ +import datetime +import logging + +__all__ = ['custom_logger'] + +# Define log level colors for terminal output +LOG_COLORS = { + 'DEBUG': '\033[96m', # Cyan + 'INFO': '\033[92m', # Green + 'WARNING': '\033[93m', # Yellow + 'ERROR': '\033[91m', # Red + 'CRITICAL': '\033[1;91m', # Bold Red + 'RESET': '\033[0m', # Reset color +} + +def custom_logger(name: str): + """ + Creates a custom logger with color-coded log levels and UTC+8 time formatting. + + Parameters + ---------- + name : str + The name of the logger, typically the name of the module or application. + + Returns + ------- + logging.Logger + A customized logger instance with color-coded levels and UTC+8 timezone support. + + Notes + ----- + - Log levels are color-coded for easier readability in terminal output. + - Log messages use UTC+8 timezone formatting. + - The logger prevents propagation to root loggers and clears existing handlers. + - The logger uses a custom `StreamHandler` with color support. 
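+
+    Examples
+    --------
+    A minimal usage sketch (the logger name is arbitrary):
+
+    >>> logger = custom_logger("demo")
+    >>> logger.info("Data generation started.")
+    >>> logger.warning("Falling back to the default template.")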
+ """ + # Create a logger instance + logger = logging.getLogger(name) + logger.setLevel(logging.INFO) # Default log level + logger.propagate = False # Disable propagation to root loggers + logger.handlers = [] # Clear any existing handlers + + # Define a custom log message format + formatter = logging.Formatter( + '[%(asctime)s]-[%(name)s:%(levelname)s]:%(message)s' + ) + + # Custom time converter to use UTC+8 + def _utc8_aera(timestamp): + """ + Convert a timestamp to a UTC+8 time tuple. + + Parameters + ---------- + timestamp : float + The timestamp to convert. + + Returns + ------- + time.struct_time + A time tuple in UTC+8 timezone. + """ + now = datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc) + datetime.timedelta(hours=8) + return now.timetuple() + + # Set the custom time converter in the formatter + formatter.converter = _utc8_aera + + # Define a custom StreamHandler with color-coded log levels + class ColorStreamHandler(logging.StreamHandler): + """ + A custom logging stream handler that adds color coding to log messages. + + Methods + ------- + emit(record): + Formats and outputs a log record with color coding based on log level. + """ + def emit(self, record): + """ + Format and emit a log record with color coding. + + Parameters + ---------- + record : logging.LogRecord + The log record to process and output. + """ + try: + msg = self.format(record) # Format the log record + color = LOG_COLORS.get(record.levelname, LOG_COLORS['RESET']) # Get the color for the log level + # Write the log message with color + self.stream.write(f"{color}{msg}{LOG_COLORS['RESET']}\n") + self.flush() # Flush the stream + except Exception: + self.handleError(record) # Handle any errors during logging + + # Create and configure the custom handler + custom_handler = ColorStreamHandler() + custom_handler.setFormatter(formatter) + + # Add the custom handler to the logger + logger.addHandler(custom_handler) + + return logger diff --git a/build/lib/edg4llm/utils/template.py b/build/lib/edg4llm/utils/template.py new file mode 100644 index 0000000..3115009 --- /dev/null +++ b/build/lib/edg4llm/utils/template.py @@ -0,0 +1,113 @@ +from dataclasses import dataclass + +@dataclass +class Template: + """ + A class to define language-specific templates for user prompts, providing a strict JSON format + to preprocess user input. If the user's prompt does not include format instructions, the + appropriate template will be added to enforce the required structure. + + Attributes: + ---------- + question_zh_template : str + A JSON format template for Chinese question prompts. Ensures that generated questions + are returned in a JSON format with a "question" field. + + answer_zh_template : str + A JSON format template for Chinese answer prompts. Ensures that generated answers + are returned in a JSON format with an "answer" field. + + dialogue_zh_template : str + A JSON format template for Chinese dialogue prompts. Ensures that the interaction is + returned in a JSON format with "input" representing the question and "output" representing + the response. + + question_en_template : str + A JSON format template for English question prompts. Ensures that generated questions + are returned in a JSON format with a "question" field. + + answer_en_template : str + A JSON format template for English answer prompts. Ensures that generated answers + are returned in a JSON format with an "answer" field. + + dialogue_en_template : str + A JSON format template for English dialogue prompts. 
Ensures that the interaction is + returned in a JSON format with "input" representing the question and "output" representing + the response. + + Notes: + ----- + This class is designed for preprocessing user prompts. If a user's input does not include + specific format instructions, the appropriate template (based on language) is appended to + the user prompt to ensure compliance with the required JSON format. + """ + + question_zh_template = \ + """ + 严格遵循规则: 请以如下格式返回生成的数据, 只返回JSON格式,json模板: + [ + { + "question":"AAA" + } + ] + 其中question字段表示生成的问题 + """ + + answer_zh_template = \ + """ + 严格遵循规则: 请以如下格式返回生成的数据, 只返回JSON格式,json模板: + [ + { + "answer":"AAA" + } + ] + 其中answer字段表示生成的答案 + """ + + dialogue_zh_template = \ + """ + 严格遵循规则: 请以如下格式返回生成的数据, 只返回JSON格式,json模板: + [ + {{ + "input":"AAA","output":"BBB" + }} + ] + 其中input字段表示问题, output字段回答 + """ + + question_en_template = \ + """ + Strictly follow the rules: Please return the generated data in the following format, + only in JSON format. JSON template: + [ + { + "question":"AAA" + } + ] + The "question" field represents the generated question. + """ + + answer_en_template = \ + """ + Strictly follow the rules: Please return the generated data in the following format, + only in JSON format. JSON template: + [ + { + "answer":"AAA" + } + ] + The "answer" field represents the generated answer. + """ + + dialogue_en_template = \ + """ + Strictly follow the rules: Please return the generated data in the following format, + only in JSON format. JSON template: + [ + {{ + "input":"AAA","output":"BBB" + }} + ] + The "input" field represents the question, and the "output" field + represents the answer. + """ diff --git a/dist/edg4llm-1.0.14-py3-none-any.whl b/dist/edg4llm-1.0.14-py3-none-any.whl new file mode 100644 index 0000000..88568a1 Binary files /dev/null and b/dist/edg4llm-1.0.14-py3-none-any.whl differ diff --git a/dist/edg4llm-1.0.14.tar.gz b/dist/edg4llm-1.0.14.tar.gz new file mode 100644 index 0000000..6138b74 Binary files /dev/null and b/dist/edg4llm-1.0.14.tar.gz differ diff --git a/edg4llm.egg-info/PKG-INFO b/edg4llm.egg-info/PKG-INFO new file mode 100644 index 0000000..02809bc --- /dev/null +++ b/edg4llm.egg-info/PKG-INFO @@ -0,0 +1,281 @@ +Metadata-Version: 2.1 +Name: edg4llm +Version: 1.0.14 +Summary: A unified tool to generate fine-tuning datasets for LLMs, including questions, answers, and dialogues. +Home-page: https://github.com/alannikos/edg4llm +Author: Alannikos +Author-email: alannikos768@outlook.com +Keywords: LLM fine-tuning data-generation AI NLP +Classifier: Programming Language :: Python :: 3 +Classifier: License :: OSI Approved :: MIT License +Classifier: Operating System :: OS Independent +Classifier: Intended Audience :: Developers +Classifier: Topic :: Software Development :: Libraries +Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence +Requires-Python: >=3.8 +Description-Content-Type: text/markdown +License-File: LICENSE +Requires-Dist: requests>=2.32.3 + +# EDG4LLM + +
+ + +``` + __ __ __ __ ___ __ __ __ __ +| | |_ | / / \ |\/| |_ | / \ |_ | \ / _ |__| | | |\/| +|/\| |__ |__ \__ \__/ | | |__ | \__/ |__ |__/ \__) | |__ |__ | | + +``` + + +
+ +
+ +[📘Documentation](https://github.com/Alannikos/FunGPT) | +[🛠️Quick Start](https://github.com/Alannikos/FunGPT) | +[🤔Reporting Issues](https://github.com/Alannikos/FunGPT/issues) + +
+ +
+ + +[![GitHub Issues](https://img.shields.io/github/issues/Alannikos/edg4llm?style=flat&logo=github&color=%23FF5252)](https://github.com/Alannikos/edg4llm/issues) +[![GitHub forks](https://img.shields.io/github/forks/Alannikos/edg4llm?style=flat&logo=github&color=%23FF9800)](https://github.com/Alannikos/edg4llm/forks) +![GitHub Repo stars](https://img.shields.io/github/stars/Alannikos/edg4llm?style=flat&logo=github&color=%23FFEB3B) +![GitHub License](https://img.shields.io/github/license/Alannikos/edg4llm?style=flat&logo=github&color=%234CAF50) +[![Discord](https://img.shields.io/discord/1327445853388144681?style=flat&logo=discord)](https://discord.com/channels/1327445853388144681/) +[![Bilibili](https://img.shields.io/badge/dynamic/json?url=https%3A%2F%2Fapi.bilibili.com%2Fx%2Frelation%2Fstat%3Fvmid%3D3494365446015137&query=%24.data.follower&style=flat&logo=bilibili&label=followers&color=%23FF69B4)](https://space.bilibili.com/3494365446015137) +[![PyPI - Version](https://img.shields.io/pypi/v/edg4llm?style=flat&logo=pypi&logoColor=blue&color=red)](https://pypi.org/project/edg4llm/) +[![PyPI - Downloads](https://img.shields.io/pypi/dm/edg4llm?color=blue&logo=pypi&logoColor=gold)](https://pypi.org/project/edg4llm/) +[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/edg4llm?logo=python&logoColor=gold)](https://pypi.org/project/edg4llm/) +
+ + +**Easy Data Generation For Large Language Model(abbreviated as EDG4LLM)**, A unified tool to generate fine-tuning datasets for LLMs, including questions, answers, and dialogues. + + +## Latest News + +
+2025 + +- [2025/01/11] 👋👋 We are excited to announce [**the initial release of edg4llm v1.0.12**](https://pypi.org/project/edg4llm/1.0.12/), marking the completion of its core functionalities. + +
+ +## Table of Contents +- [Latest News](#latest-news) +- [Introduction](#introduction) +- [Features](#features) +- [Installation](#installation) +- [Quick Start](#quick-start) +- [Requirements](#requirements) +- [License](#license) +- [Future Development Plans](#future-development-plans) +- [Acknowledgments](#acknowledgments) +- [License](#license) +- [Contact us](#contact-me) +- [Star History](#star-history) + +## Introduction +**edg4llm** is a Python library designed specifically for generating fine-tuning data using large language models. This tool aims to assist users in creating high-quality training datasets efficiently. At its current stage, it mainly supports text data generation. The generated data includes, but is not limited to: +- **Question data** +- **Answer data** +- **Dialogue data** + +With **edg4llm**, users can easily produce diverse datasets tailored to fine-tuning requirements, significantly enhancing the performance of large language models in specific tasks. +## Features +EDG4LLM is a unified tool designed to simplify and accelerate the creation of fine-tuning datasets for large language models. With a focus on usability, efficiency, and adaptability, it offers a range of features to meet diverse development needs while ensuring seamless integration and robust debugging support. + +1. **Simple to Use**: Provides a straightforward interface that allows users to get started without complex configurations. +2. **Lightweight**: Minimal dependencies and low resource consumption make it efficient and easy to use. +3. **Flexibility**: Supports a variety of data formats and generation options, allowing customization to meet specific needs. +4. **Compatibility**: Seamlessly integrates with mainstream large language models and is suitable for various development scenarios. +5. **Transparent Debugging**: Provides clear and detailed log outputs, making it easy to debug and trace issues effectively. + +## Installation +To install **edg4llm**, simply run the following command in your terminal: + + +```bash +pip install edg4llm +``` + +### Supported Python Versions +- **Supported Python Versions**: Python 3.8 or higher is required for compatibility with this library. Ensure your environment meets this version requirement. + +### Supported LLM Provider +The current version of edg4llm supports the following large language model providers: +- [**InternLM**](https://github.com/InternLM) + - Developer: Developed by the Shanghai Artificial Intelligence Laboratory. + - Advantages: InternLM is a series of open-source large language models that offer outstanding reasoning, long-text processing, and tool usage capabilities. + +- [**ChatGLM**](https://github.com/THUDM/) + - Developer: Jointly developed by Tsinghua University and Zhipu AI. + - Advantages: ChatGLM is an open-source, bilingual dialog language model based on the General Language Model (GLM) architecture. It has been trained on a large corpus of Chinese and English text, making it highly effective for generating natural and contextually relevant responses. +- [**DeepSeek**](https://github.com/deepseek-ai/) + - Developer: Developed by the DeepSeek team. + - Advantages: DeepSeek-V3 is a powerful and cost-effective open-source large language model. It offers top-tier performance, especially in tasks like language generation, question answering, and dialog systems. +- [**OpenAI ChatGPT**](https://chatgpt.com/) + - Developer: Developed by OpenAI. 
+## Quick Start
+
+To get started with **edg4llm**, follow the steps below. This example demonstrates how to use the library to generate dialogue data based on a specific prompt.
+
+### Prerequisites
+
+1. Install the **edg4llm** package:
+
+   ```bash
+   pip install edg4llm
+   ```
+
+2. Ensure you have Python 3.8 or higher.
+
+3. Obtain the API key and base URL for your chosen model provider (e.g., ChatGLM).
+
+### Code Example (Chinese Version)
+```python
+# chatglm_demo.py
+
+import edg4llm
+print(edg4llm.__version__)
+
+from edg4llm import EDG4LLM
+
+api_key = "xxx"
+base_url = "https://open.bigmodel.cn/api/paas/v4/chat/completions"
+
+edg = EDG4LLM(model_provider='chatglm', model_name="glm-4-flash", base_url=base_url, api_key=api_key)
+
+# 设置测试数据
+system_prompt = """你是一个精通中国古代诗词的古文学大师"""
+
+user_prompt = """
+    目标: 1. 请生成过年为场景的连续多轮对话记录
+          2. 提出的问题要多样化。
+          3. 要符合人类的说话习惯。
+          4. 严格遵循规则: 请以如下格式返回生成的数据, 只返回JSON格式, json模板:
+            [
+                {{
+                    "input":"AAA","output":"BBB"
+                }}
+            ]
+            其中input字段表示一个人的话语, output字段表示专家的话语
+"""
+num_samples = 1  # 只生成一个对话样本
+
+# 调用 generate 方法生成对话
+data_dialogue = edg.generate(
+    task_type="dialogue",
+    system_prompt=system_prompt,
+    user_prompt=user_prompt,
+    num_samples=num_samples
+)
+```
+
+### Code Example (English Version)
+```python
+# chatglm_demo.py
+
+import edg4llm
+print(edg4llm.__version__)
+
+from edg4llm import EDG4LLM
+
+api_key = "xxx"
+base_url = "https://open.bigmodel.cn/api/paas/v4/chat/completions"
+
+edg = EDG4LLM(model_provider='chatglm', model_name="glm-4-flash", base_url=base_url, api_key=api_key)
+
+# Set the test data
+system_prompt = """You are a master of ancient Chinese literature, specializing in classical poetry."""
+
+user_prompt = """
+    Goal: 1. Please generate a multi-turn dialogue set in the context of celebrating the Lunar New Year.
+          2. The questions should be diverse.
+          3. The dialogue should align with natural human conversational habits.
+          4. Strictly follow this rule: Please return the generated data in the following format, only in JSON format. JSON template:
+            [
+                {{
+                    "input":"AAA","output":"BBB"
+                }}
+            ]
+            Where the input field represents a person's dialogue, and the output field represents the expert's response.
+"""
+num_samples = 1  # Generate only one dialogue sample
+
+# Call the generate method to produce the dialogue
+data_dialogue = edg.generate(
+    task_type="dialogue",
+    system_prompt=system_prompt,
+    user_prompt=user_prompt,
+    num_samples=num_samples
+)
+```
+
+### Explanation
+
+1. Importing the library: import the `edg4llm` library and verify the version using `print(edg4llm.__version__)`.
+
+2. Initialization: use `EDG4LLM` to initialize the library with the appropriate model provider, model name, base URL, and API key.
+
+3. Prompts:
+   - `system_prompt` defines the behavior or role of the assistant.
+   - `user_prompt` provides specific instructions for generating data.
+4. Data generation: call the `generate` method with the following parameters:
+   - `task_type`: the type of task (e.g., dialogue, question answering).
+   - `system_prompt` and `user_prompt`: context and task-specific instructions.
+   - `num_samples`: how many samples to generate.
+5. Output: the generated data is returned in the JSON-style format requested in the prompt; one way to save it to disk is sketched below.
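If you want to keep the generated samples for later fine-tuning, the following is a minimal sketch of writing them to a JSON file. It assumes the return value of `generate` (here `data_dialogue`, from the examples above) is JSON-serializable, e.g., a list of input/output records as requested in the prompts; adapt it to whatever structure your run actually returns.

```python
import json

# Write the generated dialogue data to disk (assumes it is JSON-serializable).
with open("dialogue_data.json", "w", encoding="utf-8") as f:
    json.dump(data_dialogue, f, ensure_ascii=False, indent=2)

print(f"Saved {len(data_dialogue)} generated sample(s) to dialogue_data.json")
```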
+## Requirements
+This project has **minimal dependencies**, requiring only the `requests` library. Make sure the following requirement is satisfied:
+
+- `requests>=2.32.3`
+
+## Future Development Plans
+1. - [ ] Record an introduction video
+2. - [ ] Support Gemini 2
+3. - [ ] Support local large language models
+4. - [ ] Support other data modalities, such as images
+
+## Acknowledgments
+| Project | Description |
+|---|---|
+| [FunGPT](https://github.com/Alannikos/FunGPT) | An open-source role-play project |
+| [InternLM](https://github.com/InternLM/InternLM) | A series of advanced open-source large language models |
+| [ChatGLM](https://github.com/THUDM/) | A bilingual dialogue language model based on the General Language Model (GLM) architecture, jointly developed by Tsinghua University and Zhipu AI |
+| [DeepSeek](https://github.com/deepseek-ai/) | A powerful and cost-effective open-source large language model, excelling in tasks such as language generation, question answering, and dialogue systems |
+| [ChatGPT](https://openai.com/chatgpt/) | A highly advanced language model developed by OpenAI, known for its robust text generation capabilities |
+
+## License
+MIT License - see [LICENSE](LICENSE) for details.
+
+## Contact Me
+Thank you for using **EDG4LLM**! Your support and feedback are invaluable in making this project better.
+
+If you encounter any issues, have suggestions, or simply want to share your thoughts, feel free to:
+- Submit an issue: visit the [Issues Page](https://github.com/Alannikos/edg4llm/issues) and describe the problem or suggestion.
+- Email me: reach out directly at alannikos768@outlook.com. I'll do my best to respond promptly.
+
+Your contributions and feedback are greatly appreciated. Thank you for helping improve this tool!
+ +## Star History + +[![Star History Chart](https://api.star-history.com/svg?repos=Alannikos/edg4llm&type=Date)](https://star-history.com/#Alannikos/edg4llm&Date) diff --git a/edg4llm.egg-info/SOURCES.txt b/edg4llm.egg-info/SOURCES.txt new file mode 100644 index 0000000..0c7a2ce --- /dev/null +++ b/edg4llm.egg-info/SOURCES.txt @@ -0,0 +1,35 @@ +LICENSE +README.md +setup.py +edg4llm/__init__.py +edg4llm.egg-info/PKG-INFO +edg4llm.egg-info/SOURCES.txt +edg4llm.egg-info/dependency_links.txt +edg4llm.egg-info/not-zip-safe +edg4llm.egg-info/requires.txt +edg4llm.egg-info/top_level.txt +edg4llm/core/__init__.py +edg4llm/core/dataGenerators.py +edg4llm/core/interface.py +edg4llm/core/pipeline.py +edg4llm/generators/__init__.py +edg4llm/generators/text_generators/__init__.py +edg4llm/generators/text_generators/answer_generator.py +edg4llm/generators/text_generators/base_generator.py +edg4llm/generators/text_generators/dialogue_generator.py +edg4llm/generators/text_generators/question_generator.py +edg4llm/models/__init__.py +edg4llm/models/baseModel.py +edg4llm/models/chatglm.py +edg4llm/models/chatgpt.py +edg4llm/models/deepseek.py +edg4llm/models/internlm.py +edg4llm/processor/__init__.py +edg4llm/processor/postprocess.py +edg4llm/processor/preprocess.py +edg4llm/utils/__init__.py +edg4llm/utils/config.py +edg4llm/utils/data_utils.py +edg4llm/utils/exceptions.py +edg4llm/utils/logger.py +edg4llm/utils/template.py \ No newline at end of file diff --git a/edg4llm.egg-info/dependency_links.txt b/edg4llm.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/edg4llm.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/edg4llm.egg-info/not-zip-safe b/edg4llm.egg-info/not-zip-safe new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/edg4llm.egg-info/not-zip-safe @@ -0,0 +1 @@ + diff --git a/edg4llm.egg-info/requires.txt b/edg4llm.egg-info/requires.txt new file mode 100644 index 0000000..d86a09d --- /dev/null +++ b/edg4llm.egg-info/requires.txt @@ -0,0 +1 @@ +requests>=2.32.3 diff --git a/edg4llm.egg-info/top_level.txt b/edg4llm.egg-info/top_level.txt new file mode 100644 index 0000000..7080f1d --- /dev/null +++ b/edg4llm.egg-info/top_level.txt @@ -0,0 +1 @@ +edg4llm