Skip to content

Commit

Permalink
Merge pull request #27 from Azizadx/misge.fine-tune
Browse files Browse the repository at this point in the history
Misge.fine tune
  • Loading branch information
Azizadx authored Feb 3, 2024
2 parents 062baad + 9060ada commit 9e9378b
Show file tree
Hide file tree
Showing 2 changed files with 7,565 additions and 0 deletions.
58 changes: 58 additions & 0 deletions scripts/tokenizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers, processors, decoders
from transformers import PreTrainedTokenizerFast

class CustomTokenizer:
    """Build, train and export a WordPiece tokenizer.

    Intended call order:

    1. ``train_tokenizer`` or ``train_tokenizer_iterable``
    2. ``setup_post_processing`` (needs the trained vocabulary)
    3. ``setup_transformers_tokenizer``
    4. ``save_transformers_tokenizer``
    """

    def __init__(self, vocab_size: int = 20_000, min_frequency: int = 2):
        """Create an untrained WordPiece tokenizer and its trainer.

        Args:
            vocab_size: Target vocabulary size passed to the trainer.
            min_frequency: Minimum frequency passed to the trainer.
        """
        self.tokenizer = Tokenizer(models.WordPiece(unk_token='[UNK]'))
        self.tokenizer.normalizer = normalizers.Sequence(
            [normalizers.Lowercase(), normalizers.NFKD()]
        )
        self.tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
        self.trainer = trainers.WordPieceTrainer(
            vocab_size=vocab_size,
            special_tokens=['[UNK]', '[PAD]', '[CLS]', '[SEP]', '[MASK]'],
            min_frequency=min_frequency,
            continuing_subword_prefix='',
        )
        # The vocabulary is empty until training, so the ids of [CLS]/[SEP]
        # cannot be looked up yet (token_to_id would return None and the
        # template processor rejects a None id). The post-processor is
        # therefore created later, in setup_post_processing().
        self.processor = None
        # NOTE(review): this decoder is stored but never attached to
        # self.tokenizer, and its '##' prefix differs from the trainer's
        # empty continuing_subword_prefix -- confirm what is intended.
        self.decoder = decoders.WordPiece(prefix='##')
        self.transformers_tokenizer = None

    def train_tokenizer(self, input_files):
        """Train the tokenizer on the given text files using self.trainer."""
        self.tokenizer.train(input_files, trainer=self.trainer)

    def train_tokenizer_iterable(self, iterable_data):
        """Train the tokenizer from an in-memory iterable using self.trainer."""
        self.tokenizer.train_from_iterator(iterable_data, trainer=self.trainer)

    def setup_post_processing(self):
        """Attach the ``[CLS] ... [SEP]`` template post-processor.

        Must be called after training, once ``[CLS]`` and ``[SEP]`` have ids
        in the vocabulary.

        Raises:
            ValueError: If ``[CLS]`` or ``[SEP]`` is not in the vocabulary
                (i.e. the tokenizer has not been trained yet).
        """
        cls_id = self.tokenizer.token_to_id('[CLS]')
        sep_id = self.tokenizer.token_to_id('[SEP]')
        if cls_id is None or sep_id is None:
            raise ValueError(
                "Special tokens [CLS]/[SEP] are missing from the vocabulary. "
                "Train the tokenizer before calling setup_post_processing."
            )
        self.processor = processors.TemplateProcessing(
            single='[CLS]:0 $A:0 [SEP]:0',
            pair='[CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1',
            special_tokens=[
                ('[CLS]', cls_id),
                ('[SEP]', sep_id),
            ],
        )
        self.tokenizer.post_processor = self.processor

    def setup_transformers_tokenizer(self):
        """Wrap self.tokenizer in a ``PreTrainedTokenizerFast``."""
        self.transformers_tokenizer = PreTrainedTokenizerFast(
            tokenizer_object=self.tokenizer,
            unk_token='[UNK]',
            pad_token='[PAD]',
            cls_token='[CLS]',
            sep_token='[SEP]',
            mask_token='[MASK]',
        )

    def save_transformers_tokenizer(self, save_path):
        """Save the wrapped tokenizer to ``save_path``.

        Raises:
            ValueError: If setup_transformers_tokenizer has not been run.
        """
        # Compare against None explicitly: a tokenizer object that defines
        # __len__ would be falsy when empty and be mistaken for "not set".
        if self.transformers_tokenizer is None:
            raise ValueError("Transformers tokenizer not set. Run setup_transformers_tokenizer first.")
        self.transformers_tokenizer.save_pretrained(save_path)
Loading

0 comments on commit 9e9378b

Please sign in to comment.