-
Notifications
You must be signed in to change notification settings - Fork 8
术语提取模型训练流程
MaoXianXin edited this page Jan 20, 2022
·
2 revisions
链接:https://pan.baidu.com/s/1R3KrzSf8jAnf_QZSz6YfGQ 提取码:mhoh
用到的数据 cs.AI.csv 可从上方的百度网盘链接下载
第一步是把包含某个关键词的句子筛选出来
import spacy
import pandas as pd
import json
from itertools import groupby
# Download spaCy models:
# Load both pretrained spaCy pipelines once; they must already be installed
# (python -m spacy download en_core_web_sm / en_core_web_lg).
_MODEL_NAMES = ("en_core_web_sm", "en_core_web_lg")
models = {name: spacy.load(name) for name in _MODEL_NAMES}
# This function converts spaCy docs to the list of named entity spans in Label Studio compatible JSON format:
def doc_to_spans(doc):
    """Convert a spaCy Doc to Label Studio named-entity spans.

    Runs of consecutive tokens sharing the same ``ent_type_`` are merged
    into a single span.  Returns a tuple ``(results, entities)`` where
    ``results`` is a list of Label Studio result dicts and ``entities``
    is the set of entity labels encountered.
    """
    token_info = [(tok.text, tok.idx, tok.ent_type_) for tok in doc]
    spans = []
    seen_labels = set()
    # groupby collapses adjacent tokens with an identical entity type.
    for label, run in groupby(token_info, key=lambda item: item[2]):
        # Tokens outside any entity carry an empty type — skip them.
        if not label:
            continue
        run = list(run)
        _, span_start, _ = run[0]
        last_word, last_idx, _ = run[-1]
        spans.append({
            'from_name': 'label',
            'to_name': 'text',
            'type': 'labels',
            'value': {
                'start': span_start,
                'end': last_idx + len(last_word),
                'text': ' '.join(word for word, _, _ in run),
                'labels': [label],
            },
        })
        seen_labels.add(label)
    return spans, seen_labels
# Load the dataset and keep only abstracts containing "regression ":
df = pd.read_csv('/home/csdn/Downloads/Tensorflow_tutorial/spaCy/domain/cs.AI.csv')
df = df[df['abstract'].str.contains("regression ", na=False)]
print(df.head())
texts = df['abstract']
# Build Label Studio import tasks, attaching every model's predictions
# to each text so they can be compared/corrected in the UI:
entities = set()
tasks = []
for text in texts:
    predictions = []
    for model_name, nlp in models.items():
        spans, ents = doc_to_spans(nlp(text))
        entities.update(ents)
        predictions.append({'model_version': model_name, 'result': spans})
    tasks.append({
        'data': {'text': text},
        'predictions': predictions,
    })
# Persist the Label Studio import file.
print(f'Save {len(tasks)} tasks to "tasks.json"')
with open('tasks.json', mode='w') as out:
    out.write(json.dumps(tasks, indent=2))
# Persist the label inventory, one label per line, sorted for stability.
print('Named entities are saved to "named_entities.txt"')
with open('named_entities.txt', mode='w') as out:
    out.write('\n'.join(sorted(entities)))
筛选出来之后,把 tasks.json 导入 Label Studio,然后对属于技术名词的短语进行标注 TECH
在导入之前,我们需要先启动 Label Studio
cd ~/Downloads/Github_repo/Custom_NER_Spacy3
sudo docker run -it -p 8080:8080 -v `pwd`/mydata:/label-studio/data heartexlabs/label-studio:latest
然后导出 .conll 格式,再转换成 .spacy 格式
导出 .conll 格式之后,需要替换导出文件第一行如下
-DOCSTART- -X- O O
spacy convert ./corpus/tech_term.conll -c conll ./corpus/
接下来初始化 base_config.cfg,内容填充如下
# This is an auto-generated partial config. To use it with 'spacy train'
# you can run spacy init fill-config to auto-fill all default settings:
# python -m spacy init fill-config ./base_config.cfg ./config.cfg
[paths]
train = null
dev = null
# FIX: [initialize] below interpolates ${paths.vectors}; without this key
# 'spacy init fill-config' aborts with an interpolation error. The official
# spaCy quickstart base config declares it here as null.
vectors = null
[system]
gpu_allocator = null
[nlp]
lang = "en"
pipeline = ["tok2vec","ner"]
batch_size = 1000
[components]
[components.tok2vec]
factory = "tok2vec"
[components.tok2vec.model]
@architectures = "spacy.Tok2Vec.v2"
[components.tok2vec.model.embed]
@architectures = "spacy.MultiHashEmbed.v2"
width = ${components.tok2vec.model.encode.width}
attrs = ["ORTH", "SHAPE"]
rows = [5000, 2500]
include_static_vectors = false
[components.tok2vec.model.encode]
@architectures = "spacy.MaxoutWindowEncoder.v2"
width = 96
depth = 4
window_size = 1
maxout_pieces = 3
[components.ner]
factory = "ner"
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null
[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
[corpora]
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
[training]
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
[training.optimizer]
@optimizers = "Adam.v1"
[training.batcher]
@batchers = "spacy.batch_by_words.v1"
discard_oversize = false
tolerance = 0.2
[training.batcher.size]
@schedules = "compounding.v1"
start = 100
stop = 1000
compound = 1.001
[initialize]
vectors = ${paths.vectors}
再执行下面的命令对配置文件进行补全操作
python -m spacy init fill-config ./base_config.cfg ./config.cfg
最后开始 NER 模型训练
python -m spacy train config.cfg --output ./output --paths.train ./corpus/tech_term.spacy --paths.dev ./corpus/tech_term.spacy --gpu-id 0
对训练好的模型进行预测
import spacy
from spacy import displacy
import pandas as pd
# Load the same filtered abstracts and run the freshly trained model on one.
df = pd.read_csv('/home/csdn/Downloads/Tensorflow_tutorial/spaCy/domain/cs.AI.csv')
df = df[df['abstract'].str.contains("regression ", na=False)]
print(df.head())
# FIX: boolean filtering keeps the original row labels, so the label-based
# lookup df['abstract'][4] raises KeyError whenever original row 4 was
# filtered out. Select by position instead; newlines are flattened because
# the model was trained on single-line sentences.
text = df['abstract'].iloc[4].replace('\n', ' ').strip()
nlp = spacy.load("./output/model-best")
doc = nlp(text)
# Render the predicted entities in the browser via displaCy.
displacy.serve(doc, style="ent")