globals.py
from glob import glob
from pathlib import Path
from sys import modules

from transformers import BertTokenizer

# Stem of the entry-point script that imported this module.
calling_module = Path(modules['__main__'].__file__).stem

TOKENIZER_INFO = '3000_3'
TOKENIZER = f'ocr_tokenizers/{TOKENIZER_INFO}/'
ORIGINAL_FILES = 'data/1m/training/original'
CORRECTED_FILES = 'data/1m/training/corrected'
ORIGINAL_VAL_FILES = 'data/1m/validation/original'
CORRECTED_VAL_FILES = 'data/1m/validation/corrected'


def get_vocab(vocab_file):
    """Read a vocabulary file into a list of tokens, one token per line."""
    with open(vocab_file, 'r', encoding='utf-8') as vocab:
        return vocab.read().splitlines()


def read_lines(file, tokenizer):
    """Yield tokenizer encodings for consecutive line pairs joined by <newline>."""
    with open(file, 'r', encoding='utf-8') as infile:
        lines = infile.readlines()
    for index, line in enumerate(lines):
        if index + 1 < len(lines):
            # Pair each line with its successor, marking the boundary.
            line1 = lines[index].strip()
            line2 = lines[index + 1].strip()
            combined = f'{line1}<newline>{line2}'
        else:
            # The last line of the file has no successor.
            combined = line.strip()
        yield tokenizer(combined)


def read_files(path, tokenizer):
    """Tokenize every file under `path`, yielding one encoding per line pair."""
    for file in glob(f'{path}/*'):
        yield from read_lines(file, tokenizer=tokenizer)
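
# Example usage (a sketch; assumes plain-text files exist under the data
# directories defined above and uses the `bert_tokenizer` constructed below):
#
#     for encoding in read_files(ORIGINAL_FILES, tokenizer=bert_tokenizer):
#         print(encoding.input_ids)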


bert_tokenizer = BertTokenizer.from_pretrained(
    TOKENIZER,
    strip_accents=False,
    do_lower_case=False,
    clean_text=False,
    special_tokens=['<unk>', '<pad>', '<bos>', '<eos>'],
    add_special_tokens=True)
# Register <newline> so the line-pair marker is never split into wordpieces.
bert_tokenizer.add_special_tokens({'additional_special_tokens': ['<newline>']})
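
# Because '<newline>' is registered as an additional special token, the
# tokenizer keeps it intact rather than splitting it, e.g. (a sketch; the
# surrounding wordpieces depend on the trained vocabulary):
#
#     bert_tokenizer.tokenize('line one<newline>line two')
#     # -> [..., '<newline>', ...] with the marker preserved as one token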


def tokenize(sequence):
    """Map a string to wordpiece strings, dropping the [CLS]/[SEP] wrappers."""
    tokenized = bert_tokenizer(sequence).input_ids
    # Guard against missing ids by falling back to id 0.
    tokenized = [0 if token_id is None else token_id for token_id in tokenized]
    if tokenized:
        # Skip the leading [CLS] and trailing [SEP] ids added by the tokenizer.
        return [wordpiece_vocab[i] for i in tokenized[1:-1]]
    return []


# Loaded at import time; used by tokenize() above to map ids back to strings.
wordpiece_vocab = get_vocab(f'{TOKENIZER}vocab.txt')
OCR_TOKENIZER = tokenize
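

if __name__ == '__main__':
    # Minimal smoke test (a sketch; assumes the tokenizer files in TOKENIZER
    # exist locally so BertTokenizer.from_pretrained succeeded above).
    sample = 'first line<newline>second line'
    print(OCR_TOKENIZER(sample))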