Skip to content

Commit

Permalink
增加分布式分词预处理脚本
Browse files Browse the repository at this point in the history
  • Loading branch information
JeremySun1224 authored Mar 28, 2020
1 parent 8df66c3 commit cd9997f
Show file tree
Hide file tree
Showing 3 changed files with 233 additions and 0 deletions.
Binary file not shown.
140 changes: 140 additions & 0 deletions corpus_process/lm_corpus_processor_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
import functools
import time

from concurrent.futures import ProcessPoolExecutor
from itertools import chain

import jieba


class Base:
    """Shared plumbing for every corpus-processing step in this module.

    Provides a process-pool ``map`` helper and a wall-clock timing
    decorator used by the cleaner/splitter subclasses below.
    """

    def __init__(self, num_worker):
        # Number of worker processes used by _multi_process.
        self.num_worker = num_worker

    def _multi_process(self, process_func, iter_list: list) -> list:
        """Apply ``process_func`` to every item of ``iter_list`` in a process pool.

        Results are returned in input order (``executor.map`` guarantee).
        """
        with ProcessPoolExecutor(max_workers=self.num_worker) as executor:
            # Materialize inside the ``with`` so all futures complete
            # before the pool shuts down.
            result = list(executor.map(process_func, iter_list))
        return result

    @staticmethod
    def timer(func):
        """Decorator: print the wall-clock runtime of each call to ``func``."""

        # BUG FIX: without functools.wraps the wrapper clobbered the
        # wrapped function's __name__ and docstring.
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            start = time.time()
            ret = func(*args, **kwargs)
            end = time.time()
            print(f'func \'{func.__name__}\' done in {round(end - start, 3)}s')
            return ret

        return wrapper


class BasePassageCleaner(Base):
    """Passage-level cleaner; subclasses override ``_clean_func``."""

    def __init__(self, num_worker):
        super(BasePassageCleaner, self).__init__(num_worker)

    @Base.timer
    def run(self, passage_list: list):
        """Clean every passage in parallel and return the cleaned list."""
        return self._multi_process(self._clean_func, passage_list)

    def _clean_func(self, passage: str) -> str:
        # Identity transform by default; override with real cleaning logic.
        return passage


class BasePassageSplitter(Base):
    """Passage-to-sentence splitter; subclasses override ``_split_func``."""

    def __init__(self, num_worker):
        super(BasePassageSplitter, self).__init__(num_worker)

    @Base.timer
    def run(self, passage_list: list):
        """Split every passage in parallel, then flatten to one sentence list."""
        splitted_passages = self._multi_process(self._split_func, passage_list)
        return self.reshape(splitted_passages)

    def reshape(self, splitted_passages: list) -> list:
        """Flatten the per-passage sentence lists into a single flat list."""
        return list(chain(*splitted_passages))

    def _split_func(self, passage: str) -> list:
        # BUG FIX: return annotation said ``-> str`` but a list of
        # sentence fragments is returned (split on the Chinese full stop).
        return passage.split('。')


class BaseSentanceCleaner(Base):
    """Sentence-level cleaner; subclasses override ``_clean_func``."""

    def __init__(self, num_worker):
        super(BaseSentanceCleaner, self).__init__(num_worker)

    @Base.timer
    def run(self, sentance_list: list) -> list:
        """Clean every sentence in parallel and return the cleaned list."""
        return self._multi_process(self._clean_func, sentance_list)

    def _clean_func(self, sentance: str) -> str:
        # Identity transform by default; override with real cleaning logic.
        return sentance


class Handler(Base):
    """Main pipeline: clean passages -> split -> clean sentences -> segment."""

    def __init__(self, num_worker, user_dict=None):
        super(Handler, self).__init__(num_worker)
        # Pipeline stages; must be injected through init() before handle().
        self.passage_cleaner = None
        self.passage_splitter = None
        self.sentance_cleaner = None
        if user_dict is not None:
            # Optional user dictionary for jieba word segmentation.
            jieba.load_userdict(user_dict)

    @Base.timer
    def init(self, passage_cleaner, passage_splitter, sentance_cleaner):
        """Inject the three pipeline stages."""
        self.passage_cleaner = passage_cleaner
        self.passage_splitter = passage_splitter
        self.sentance_cleaner = sentance_cleaner
        print('handler initialized')  # plain string: no placeholders needed

    @Base.timer
    def segment(self, cleaned_sentances: list, use_hmm: bool = False) -> list:
        """Word-segment each sentence with jieba; words are space-joined.

        NOTE(review): jieba.enable_parallel relies on fork and is
        POSIX-only — confirm this never runs on Windows.
        """
        jieba.enable_parallel(self.num_worker)
        try:
            segmented = [' '.join(jieba.lcut(s, HMM=use_hmm)) for s in cleaned_sentances]
        finally:
            # BUG FIX: always restore jieba to serial mode, even if
            # segmentation raises part-way through.
            jieba.disable_parallel()
        return segmented

    @Base.timer
    def handle(self, passage_list):
        """Run the full pipeline over raw passages; return segmented sentences."""
        assert self.passage_cleaner is not None
        assert self.passage_splitter is not None
        assert self.sentance_cleaner is not None
        cleaned_passages = self.passage_cleaner.run(passage_list)
        splitted_passages = self.passage_splitter.run(cleaned_passages)
        cleaned_sentances = self.sentance_cleaner.run(splitted_passages)
        return self.segment(cleaned_sentances)

class Segmentor:
    """Standalone parallel jieba word segmentor."""

    def __init__(self, num_worker):
        # Number of jieba parallel workers.
        self.num_worker = num_worker

    def segment(self, sentance_list: list) -> list:
        """Segment each sentence into space-joined words.

        BUG FIX: the original ``return segment`` referenced an undefined
        name and raised NameError on every call. Implemented consistently
        with ``Handler.segment`` above.
        """
        jieba.enable_parallel(self.num_worker)
        try:
            segmented = [' '.join(jieba.lcut(s)) for s in sentance_list]
        finally:
            jieba.disable_parallel()
        return segmented


if __name__ == '__main__':
    # Smoke test: three toy passages split on the Chinese full stop.
    passage_list = ['a。b。c。d。f。e', 'a。b。c。d。f。e', 'a。b。c。d。f。e']
    passage_cleaner = BasePassageCleaner(3)
    passage_splitter = BasePassageSplitter(3)
    sentance_cleaner = BaseSentanceCleaner(3)
    handler = Handler(3)
    handler.init(passage_cleaner, passage_splitter, sentance_cleaner)
    # BUG FIX: handle() returns the processed sentences; the original
    # discarded the return value and then read ``handler.cleaned_sentance``,
    # an attribute that never exists -> AttributeError.
    cleaned_sentances = handler.handle(passage_list)
    print(cleaned_sentances)
93 changes: 93 additions & 0 deletions corpus_process/pipeline_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
from lm_corpus_processor_base import *

import jieba
import re

class PassageCleaner(BasePassageCleaner):
    """Passage cleaner that strips HTML tags and collapses repeated punctuation."""

    def __init__(self, num_worker):
        super(PassageCleaner, self).__init__(num_worker)

    @staticmethod
    def remove_html(sentance: str) -> str:
        """Strip HTML tags, then collapse repeated commas/spaces/ellipses/dashes."""
        re_tag = re.compile(r'</?\w+[^>]*>')  # HTML tag
        new_text = re.sub(re_tag, '', sentance)
        new_text = re.sub(",+", ",", new_text)  # collapse commas
        new_text = re.sub(" +", " ", new_text)  # collapse spaces
        # NOTE(review): this is a character class, so it also collapses
        # literal '|' characters — confirm that is intended.
        new_text = re.sub("[...|…|。。。]+", "...", new_text)  # collapse ellipses
        new_text = re.sub("-+", "--", new_text)  # collapse dashes
        new_text = re.sub("———+", "———", new_text)  # collapse em-dash runs
        return new_text

    def _clean_func(self, passage):
        # BUG FIX: the original called remove_html() but discarded its
        # return value and returned the raw passage unchanged, making the
        # cleaner a no-op.
        return self.remove_html(passage)


class PassageSplitter(BasePassageSplitter):
    """Splits Chinese passages into sentences on punctuation boundaries."""

    def __init__(self, num_worker):
        super(PassageSplitter, self).__init__(num_worker)

    def _split_func(self, passage):
        """Insert newlines after sentence-ending punctuation, then split.

        A closing quote stays attached to the sentence it terminates, so
        when a terminator is followed by ” or ’ the break is placed after
        the quote mark instead.
        """
        text = passage
        # Single-character terminator not followed by a closing quote.
        text = re.sub('([;,。!?\?])([^”’])', r"\1\n\2", text)
        # English ellipsis (six dots).
        text = re.sub('(\.{6})([^”’])', r"\1\n\2", text)
        # Chinese ellipsis.
        text = re.sub('(\…{2})([^”’])', r"\1\n\2", text)
        # Terminator followed by a closing quote: break after the quote.
        text = re.sub('([;,。!?\?][”’])([^,。!?\?])', r'\1\n\2', text)
        # Drop any trailing newline left at the end of the passage.
        text = text.rstrip()
        return text.split("\n")


class SentanceCleaner(BaseSentanceCleaner):
    """Keeps only CJK ideographs, ASCII digits and ASCII letters."""

    def __init__(self, num_worker, user_dict_file=None):
        super(SentanceCleaner, self).__init__(num_worker)
        # NOTE(review): user_dict_file is accepted but never used — confirm
        # whether it was meant to be loaded (e.g. via jieba.load_userdict).

    @staticmethod
    def remove_other(sentance):
        """Return ``sentance`` stripped of everything except Chinese
        characters, digits and ASCII letters."""

        def _keep(uchar):
            # Same ranges as the original helpers: CJK unified ideographs
            # (\u4e00-\u9fa5), digits (\u0030-\u0039), ASCII letters
            # (\u0041-\u005a, \u0061-\u007a).
            return (
                '\u4e00' <= uchar <= '\u9fa5'
                or '0' <= uchar <= '9'
                or 'A' <= uchar <= 'Z'
                or 'a' <= uchar <= 'z'
            )

        # FIX: ''.join avoids the quadratic cost of repeated string
        # concatenation; `or` replaces the bitwise `|` on booleans.
        return ''.join(ch for ch in sentance if _keep(ch))

    def _clean_func(self, sentance):
        return self.remove_other(sentance)


if __name__ == '__main__':
    # Build the pipeline and run it over 300 copies of a sample passage.
    sample = '我爱北京天安门,天安门上太阳升。伟大领袖毛主席,指引我们向前进。'
    passage_list = 100 * [sample, sample, sample]
    handler = Handler(3)
    cleaner = PassageCleaner(3)
    splitter = PassageSplitter(3)
    sentance_cleaner = SentanceCleaner(3)
    handler.init(cleaner, splitter, sentance_cleaner)
    result = handler.handle(passage_list)
    print(result[:10])  # preview the first ten segmented sentences






0 comments on commit cd9997f

Please sign in to comment.