Add src/train/tokenizer.py
yuiseki committed Mar 28, 2024
1 parent dc16faf commit b9dc33a
Showing 3 changed files with 145 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,5 +1,6 @@
!output/.gitkeep
output
wandb
tmp
/.venv
/.mypy_cache
82 changes: 82 additions & 0 deletions src/dataset/prepare.py
@@ -0,0 +1,82 @@
import sys
from pathlib import Path

import numpy as np
from datasets import load_dataset

# support running without installing as a package
wd = Path(__file__).parent.parent.resolve()
sys.path.append(str(wd))

import lit_llama.packed_dataset as packed_dataset
from lit_llama import HFTokenizer, Tokenizer

dataset_ids = ["izumi-lab/wikinews-ja-20230728", "izumi-lab/wikinews-en-20230728", "if001/aozorabunko-clean-sin"]


def format_number(num):
    if abs(num) >= 10**12:  # Trillion
        return "{:.2f}T".format(num / 10**12)
    elif abs(num) >= 10**9:  # Billion
        return "{:.2f}B".format(num / 10**9)
    elif abs(num) >= 10**6:  # Million
        return "{:.2f}M".format(num / 10**6)
    else:
        return str(num)


def prepare_for_dataset(
    dataset_ids: list[str],
    tokenizer_path: Path,
    destination_path: Path,
    chunk_size: int,
) -> None:
    destination_path.mkdir(parents=True, exist_ok=True)
    # tokenizer = Tokenizer(tokenizer_path)
    tokenizer = HFTokenizer(model_path=tokenizer_path)
    total_token_cnt = 0
    for dataset_id in dataset_ids:
        token_cnt = 0
        print(f"Processing {dataset_id}")
        prefix = dataset_id.split("/")[-1]
        builder = packed_dataset.PackedDatasetBuilder(
            outdir=destination_path,
            prefix=prefix,
            chunk_size=chunk_size,
            sep_token=tokenizer.bos_id,
            dtype="auto",
            vocab_size=tokenizer.vocab_size,
        )
        ds = load_dataset(dataset_id)
        ds = ds["train"]

        if "aozora" in dataset_id:
            for v in ds["text"]:
                text_ids = tokenizer.encode(v)
                token_cnt += len(text_ids)
                builder.add_array(np.array(text_ids, dtype=builder.dtype))
        else:
            for v in ds:
                text_ids = tokenizer.encode(v["text"])
                token_cnt += len(text_ids)
                builder.add_array(np.array(text_ids, dtype=builder.dtype))
        builder.write_reminder()
        print("tokens ", format_number(token_cnt))
        total_token_cnt += token_cnt
    print("total tokens", format_number(total_token_cnt))


def prepare(
    destination_path: Path = Path("/data/YuisekinAI_data"),
    # tokenizer trained and saved by src/train/tokenizer.py
    tokenizer_path: Path = Path("./tmp/tokenizer.json"),
    # 2048 block size + 1 for causal (from LLama), 1024 blocks
    chunk_size: int = 2049 * 1024,
) -> None:
    prepare_for_dataset(
        dataset_ids=dataset_ids,
        tokenizer_path=tokenizer_path,
        destination_path=destination_path,
        chunk_size=chunk_size,
    )


if __name__ == "__main__":
    prepare()
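
For reference, a minimal sketch of how the packing step above could be driven from other code; the import path and the output directory are assumptions, while the dataset IDs, tokenizer path, and chunk size are the ones used in this file.

from pathlib import Path

from src.dataset.prepare import prepare_for_dataset  # import path is an assumption

# Pack two of the corpora listed in this file into fixed-size token chunks.
prepare_for_dataset(
    dataset_ids=["izumi-lab/wikinews-ja-20230728", "izumi-lab/wikinews-en-20230728"],
    tokenizer_path=Path("./tmp/tokenizer.json"),  # produced by src/train/tokenizer.py
    destination_path=Path("./tmp/packed"),  # hypothetical output directory
    chunk_size=2049 * 1024,  # 2048-token blocks + 1 for the causal shift, 1024 blocks per chunk
)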
62 changes: 62 additions & 0 deletions src/train/tokenizer.py
@@ -0,0 +1,62 @@
# https://zenn.dev/if001/articles/87bbe893411fa1
import datasets
from tokenizers import Tokenizer, decoders, models, normalizers, pre_tokenizers, trainers

dataset_list = [
    {"id": "wikimedia/wikipedia", "config": "20231101.en"},
    {"id": "wikimedia/wikipedia", "config": "20231101.ja"},
    {"id": "CohereForAI/aya_dataset", "config": "en"},
    {"id": "CohereForAI/aya_dataset", "config": "ja"},
]


def init_tokenizer():
    # Unigram language-model tokenizer with NFKC normalization,
    # script-boundary pre-tokenization, and a BPE-style decoder.
    tokenizer = Tokenizer(models.Unigram())
    tokenizer.normalizer = normalizers.NFKC()
    tokenizer.pre_tokenizer = pre_tokenizers.UnicodeScripts()
    tokenizer.decoder = decoders.BPEDecoder()
    return tokenizer


def train(tokenizer, trainer):
    def ds_yielder():
        for dataset_data in dataset_list:
            print("start...", dataset_data["id"], dataset_data["config"])
            dataset_id = dataset_data["id"]
            dataset_config = dataset_data["config"]
            if dataset_config is not None:
                dataset = datasets.load_dataset(dataset_id, dataset_config)
            else:
                dataset = datasets.load_dataset(dataset_id)
            ds = dataset["train"]
            print("ds", ds)
            # ds = ds.select(range(0, 100))
            if "aya" in dataset_id:
                for v in ds["inputs"]:
                    yield v
            else:
                for v in ds:
                    yield v["text"]

    tokenizer.train_from_iterator(ds_yielder(), trainer=trainer)
    return tokenizer


def main():
    save_path = "./tmp/tokenizer.json"
    vocab_size = 32000

    tokenizer = init_tokenizer()
    trainer = trainers.UnigramTrainer(
        vocab_size=vocab_size,
        show_progress=True,
        special_tokens=["<PAD>", "<BOS>", "<EOS>", "<UNK>", "<MASK>"],
        unk_token="<UNK>",
    )
    tokenizer = train(tokenizer, trainer)
    tokenizer.save(save_path)
    print(f"save... {save_path}")


if __name__ == "__main__":
    main()
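
Once the run finishes, the saved tokenizer.json can be loaded back with the same tokenizers library; a minimal sketch, using an illustrative sample sentence:

from tokenizers import Tokenizer

# Load the tokenizer trained and saved by main() above.
tokenizer = Tokenizer.from_file("./tmp/tokenizer.json")

# Encode an illustrative Japanese sentence ("This is a tokenizer check.") and inspect the result.
encoding = tokenizer.encode("これはトークナイザーの動作確認です。")
print(encoding.tokens)  # subword pieces from the Unigram model
print(encoding.ids)     # integer ids within the 32000-entry vocabulary
print(tokenizer.decode(encoding.ids))  # round-trip back to text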
