-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathun_ner_tokens.py
24 lines (20 loc) · 1.04 KB
/
un_ner_tokens.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import pandas as pd
import itertools
import os
from datasets import Dataset
from datasets import load_dataset
def get_tokens_and_ner_tags(filename):
    """Parse one tab-separated token/tag file into a per-sentence DataFrame.

    Each non-blank line is split on tabs: the first field is the token and
    the second field is its NER tag (any further fields are ignored).
    Sentences are separated by blank lines (lines consisting solely of a
    newline).

    Args:
        filename: Path of the UTF-8 encoded file to read.

    Returns:
        A ``pandas.DataFrame`` with one row per sentence and two columns:
        ``tokens`` (list of token strings) and ``ner_tags`` (list of tag
        strings, aligned with ``tokens``).

    Raises:
        IndexError: If a non-blank line contains no tab character.
    """
    with open(filename, 'r', encoding="utf8") as f:
        lines = f.readlines()

    # Runs of consecutive non-blank lines form one sentence; the blank-line
    # runs themselves are discarded.
    sentences = [
        list(group)
        for is_separator, group in itertools.groupby(lines, lambda line: line == '\n')
        if not is_separator
    ]

    # Strip the line terminator explicitly instead of chopping the last
    # character of the tag: the final line of a file may lack a trailing
    # newline, and rows with extra columns have no newline in the tag field.
    rows = [
        [line.rstrip('\n').split('\t') for line in sentence]
        for sentence in sentences
    ]
    tokens = [[fields[0] for fields in sentence] for sentence in rows]
    entities = [[fields[1] for fields in sentence] for sentence in rows]
    return pd.DataFrame({'tokens': tokens, 'ner_tags': entities})
def get_all_tokens_and_ner_tags(directory):
    """Parse every entry of *directory* and stack the results into one frame.

    Args:
        directory: Path of a directory whose entries are all passed to
            ``get_tokens_and_ner_tags``.

    Returns:
        A ``pandas.DataFrame`` with columns ``tokens`` and ``ner_tags`` and a
        fresh 0..n-1 index. An empty directory yields an empty frame with
        those two columns.
    """
    # os.listdir returns entries in arbitrary order; sort so the resulting
    # row order is reproducible across runs and platforms.
    frames = [
        get_tokens_and_ner_tags(os.path.join(directory, filename))
        for filename in sorted(os.listdir(directory))
    ]
    if not frames:
        # pd.concat raises ValueError on an empty sequence.
        return pd.DataFrame({'tokens': [], 'ner_tags': []})
    # ignore_index discards the per-file indices and renumbers the rows.
    return pd.concat(frames, ignore_index=True)
def get_un_token_dataset(train_directory, test_directory):
    """Build the train and test datasets from two directories of tagged files.

    Args:
        train_directory: Directory passed to ``get_all_tokens_and_ner_tags``
            to produce the training frame.
        test_directory: Directory passed to ``get_all_tokens_and_ner_tags``
            to produce the test frame.

    Returns:
        A ``(train_dataset, test_dataset)`` tuple, each element created with
        ``Dataset.from_pandas`` from the corresponding frame.
    """
    # Load both frames first, then convert them, train before test.
    frames = [
        get_all_tokens_and_ner_tags(source_dir)
        for source_dir in (train_directory, test_directory)
    ]
    train_dataset, test_dataset = [Dataset.from_pandas(frame) for frame in frames]
    return (train_dataset, test_dataset)