From d5cb6b3b9af9a6aa9286f92ad852331d65989787 Mon Sep 17 00:00:00 2001
From: yuiseki
Date: Thu, 28 Mar 2024 16:03:05 +0900
Subject: [PATCH] Fix a mistake in the templates
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/dataset/prepare.py | 102 -----------------------------------------
 src/train.py           |  10 ++--
 src/train/tokenizer.py |  72 -----------------------------
 3 files changed, 5 insertions(+), 179 deletions(-)
 delete mode 100644 src/dataset/prepare.py
 delete mode 100644 src/train/tokenizer.py

diff --git a/src/dataset/prepare.py b/src/dataset/prepare.py
deleted file mode 100644
index c04c5ae..0000000
--- a/src/dataset/prepare.py
+++ /dev/null
@@ -1,102 +0,0 @@
-import os
-import time
-from pathlib import Path
-
-from litgpt import HFTokenizer
-from litgpt.data.prepare_starcoder import DataChunkRecipe
-from litdata.processing.data_processor import DataProcessor
-
-from datasets.load import load_dataset
-
-import sys
-
-# support running without installing as a package
-wd = Path(__file__).parent.parent.resolve()
-sys.path.append(str(wd))
-
-dataset_list = [
-    {"id": "wikimedia/wikipedia", "config": "20231101.en"},
-    {"id": "wikimedia/wikipedia", "config": "20231101.ja"},
-    {"id": "CohereForAI/aya_dataset", "config": "en"},
-    {"id": "CohereForAI/aya_dataset", "config": "ja"},
-]
-
-
-def format_number(num):
-    if abs(num) >= 10**12:  # Trillion
-        return "{:.2f}T".format(num / 10**12)
-    elif abs(num) >= 10**9:  # Billion
-        return "{:.2f}B".format(num / 10**9)
-    elif abs(num) >= 10**6:  # Million
-        return "{:.2f}M".format(num / 10**6)
-    else:
-        return str(num)
-
-
-class YuisekinAIDataRecipe(DataChunkRecipe):
-    def __init__(self, tokenizer: HFTokenizer, chunk_size: int):
-        super().__init__(chunk_size)
-        self.tokenizer = tokenizer
-        self.total_token_cnt = 0
-
-    def prepare_item(self):
-        for dataset_data in dataset_list:
-            print("start...", dataset_data["id"], dataset_data["config"])
-            dataset_id = dataset_data["id"]
-            dataset_config = dataset_data["config"]
-            if dataset_config is not None:
-                dataset = load_dataset(dataset_id, dataset_config)
-            else:
-                dataset = load_dataset(dataset_id)
-            ds = dataset["train"]
-            print("ds", ds)
-            if "aya" in dataset_id:
-                for v in ds["inputs"]:
-                    text_ids = self.tokenizer.encode(v, bos=False, eos=True)
-                    self.total_token_cnt += len(text_ids)
-                    yield text_ids
-            else:
-                for v in ds:
-                    text_ids = self.tokenizer.encode(v["text"], bos=False, eos=True)
-                    self.total_token_cnt += len(text_ids)
-                    yield text_ids
-
-
-def prepare_for_dataset(
-    tokenizer_path: Path,
-    destination_path: Path,
-    chunk_size: int,
-) -> None:
-    destination_path.mkdir(parents=True, exist_ok=True)
-
-    tokenizer = HFTokenizer(tokenizer_path)
-    data_recipe = YuisekinAIDataRecipe(tokenizer=tokenizer, chunk_size=chunk_size)
-    data_processor = DataProcessor(
-        input_dir=None,
-        output_dir=str(destination_path),
-        fast_dev_run=True,
-        num_workers=os.cpu_count(),
-        num_downloaders=1,
-    )
-
-    start_time = time.time()
-    data_processor.run(data_recipe)
-    elapsed_time = time.time() - start_time
-    print(f"Time taken: {elapsed_time:.2f} seconds")
-
-
-def prepare(
-    destination_path: Path = Path("/data/YuisekinAI_data"),
-    # 2048 block size + 1 for causal (from LLama), 1024 blocks
-    chunk_size: int = 2049 * 1024,
-) -> None:
-    tokenizer_path = Path("./tmp/tokenizer.json")
-    prepare_for_dataset(
-        tokenizer_path=tokenizer_path,
-        destination_path=destination_path,
-        chunk_size=chunk_size,
-    )
-
-
-if __name__ == "__main__":
-    prepare()

diff --git a/src/train.py b/src/train.py
index 709e49c..09ee104 100644
--- a/src/train.py
+++ b/src/train.py
@@ -36,7 +36,7 @@ def simple_template_for_pretrain(input) -> str:
     # Remove runs of two or more consecutive newlines from the input
     input = "\n".join([line for line in input.splitlines() if line.strip() != ""])
     template = f"""\
-    {input}
+    {input}\
     """
     # Remove any leading whitespace characters from each line in the template.
     template = "\n".join([line.lstrip() for line in template.splitlines()])
@@ -50,7 +50,7 @@ def simple_template_for_train(input, output) -> str:
     <|im_end|>
     <|im_start|>assistant
     {output}
-    <|im_end|>
+    <|im_end|>\
     """
     # Remove any leading whitespace characters from each line in the template.
     template = "\n".join([line.lstrip() for line in template.splitlines()])
@@ -65,7 +65,7 @@ def hint_template_for_train(hint, question, answer):
     <|im_end|>
     <|im_start|>assistant
     {answer}
-    <|im_end|>
+    <|im_end|>\
     """
     # Remove any leading whitespace characters from each line in the template.
     template = "\n".join([line.lstrip() for line in template.splitlines()])
@@ -80,7 +80,7 @@ def context_template_for_train(context, question, answer):
     <|im_end|>
     <|im_start|>assistant
     {answer}
-    <|im_end|>
+    <|im_end|>\
     """
     # Remove any leading whitespace characters from each line in the template.
     template = "\n".join([line.lstrip() for line in template.splitlines()])
@@ -98,7 +98,7 @@ def context_hint_template_for_train(hint, context, question, answer):
     <|im_end|>
     <|im_start|>assistant
     {answer}
-    <|im_end|>
+    <|im_end|>\
     """
     # Remove any leading whitespace characters from each line in the template.
     template = "\n".join([line.lstrip() for line in template.splitlines()])

diff --git a/src/train/tokenizer.py b/src/train/tokenizer.py
deleted file mode 100644
index fced38f..0000000
--- a/src/train/tokenizer.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# Reference:
-# - https://zenn.dev/if001/articles/87bbe893411fa1
-from datasets.arrow_dataset import Dataset
-from datasets.load import load_dataset
-from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers
-
-dataset_list = [
-    {"id": "CohereForAI/aya_dataset", "config": None, "filter": {"field": "language_code", "value": "eng"}},
-    {"id": "CohereForAI/aya_dataset", "config": None, "filter": {"field": "language_code", "value": "jpn"}},
-    {"id": "wikimedia/wikipedia", "config": "20231101.en"},
-    {"id": "wikimedia/wikipedia", "config": "20231101.ja"},
-]
-
-
-def init_tokenizer():
-    tokenizer = Tokenizer(models.Unigram())
-    tokenizer.normalizer = normalizers.NFKC()
-    tokenizer.pre_tokenizer = pre_tokenizers.UnicodeScripts()
-    tokenizer.decoder = decoders.BPEDecoder()
-    return tokenizer
-
-
-def train(tokenizer, trainer):
-    def ds_yielder():
-        for dataset_data in dataset_list:
-            print("start...", dataset_data)
-            dataset_id = dataset_data["id"]
-            dataset_config = dataset_data["config"]
-            if dataset_config is not None:
-                raw_dataset = load_dataset(dataset_id, dataset_config, split="train")
-            else:
-                raw_dataset = load_dataset(dataset_id, split="train")
-
-            if "filter" in dataset_data:
-                data_df = raw_dataset.to_pandas()
-                filter_field = dataset_data["filter"]["field"]
-                filter_value = dataset_data["filter"]["value"]
-                data_df = data_df[data_df[filter_field] == filter_value]
-                dataset = Dataset.from_pandas(data_df)
-                ds = dataset
-            else:
-                ds = raw_dataset
-            print("ds", ds)
-            if "aya" in dataset_id:
-                for v in ds["inputs"]:
-                    yield v
-            else:
-                for v in ds:
-                    yield v["text"]
-
-    tokenizer.train_from_iterator(ds_yielder(), trainer=trainer)
-    return tokenizer
-
-
-def main():
-    save_path = "./tmp/tokenizer.json"
-    vocab_size = 32000
-
-    tokenizer = init_tokenizer()
-    trainer = trainers.UnigramTrainer(
-        vocab_size=vocab_size,
-        show_progress=True,
-        special_tokens=["", "", "", "", ""],
-        unk_token="",
-    )
-    tokenizer = train(tokenizer, trainer)
-    tokenizer.save(save_path)
-    print(f"save... {save_path}")
-
-
-if __name__ == "__main__":
-    main()
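
Note on the deleted src/train/tokenizer.py: it trained a Unigram tokenizer with the Hugging Face tokenizers library and saved it to ./tmp/tokenizer.json, which the (also deleted) src/dataset/prepare.py then consumed. If such a file is still on disk, it can be inspected with the same library; a minimal sketch, assuming the file exists and using an arbitrary sample sentence:

from tokenizers import Tokenizer

# Load the serialized tokenizer produced by the removed training script
# (assumes ./tmp/tokenizer.json is still present on disk).
tokenizer = Tokenizer.from_file("./tmp/tokenizer.json")

encoding = tokenizer.encode("This is an arbitrary sample sentence.")
print(encoding.ids)     # token ids
print(encoding.tokens)  # the corresponding subword strings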