-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
8df66c3
commit cd9997f
Showing
3 changed files
with
233 additions
and
0 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
from concurrent.futures import ProcessPoolExecutor | ||
import time | ||
from itertools import chain | ||
import jieba | ||
|
||
|
||
class Base:
    """Common base for every pipeline stage.

    Holds the worker count and provides two shared utilities: a process-pool
    ``map`` helper and a ``timer`` decorator that prints elapsed time.
    """

    def __init__(self, num_worker):
        # Passed as ``max_workers`` to the process pool in ``_multi_process``.
        self.num_worker = num_worker

    def _multi_process(self, process_func, iter_list: list) -> list:
        """Apply ``process_func`` to each item of ``iter_list`` in a process pool.

        Args:
            process_func: callable taking one item; it is sent to worker
                processes, so it has to be picklable.
            iter_list: the items to process.

        Returns:
            The per-item results as a list, in the order of ``iter_list``.
        """
        with ProcessPoolExecutor(max_workers=self.num_worker) as executor:
            return list(executor.map(process_func, iter_list))

    @staticmethod
    def timer(func):
        """Decorator that prints how long ``func`` took and returns its result."""
        # Imported locally so this block does not depend on a new file-level import.
        from functools import wraps

        @wraps(func)  # keep the wrapped function's __name__ / __doc__
        def wrapper(*args, **kwargs):
            # perf_counter is monotonic, so the interval cannot go negative
            # if the system clock is adjusted mid-call.
            start = time.perf_counter()
            ret = func(*args, **kwargs)
            end = time.perf_counter()
            print(f'func \'{func.__name__}\' done in {round(end - start, 3)}s')
            return ret
        return wrapper
|
||
|
||
class BasePassageCleaner(Base):
    """Passage-level cleaning stage.

    Subclasses customise the stage by overriding ``_clean_func``.
    """

    def __init__(self, num_worker):
        super().__init__(num_worker)

    @Base.timer
    def run(self, passage_list: list):
        """Run ``_clean_func`` over every passage and return the results as a list."""
        return self._multi_process(self._clean_func, passage_list)

    def _clean_func(self, passage: str) -> str:
        """Default cleaner: hand the passage back unchanged."""
        return passage
|
||
|
||
class BasePassageSplitter(Base):
    """Passage-to-sentence splitting stage.

    Subclasses customise the stage by overriding ``_split_func``.
    """

    def __init__(self, num_worker):
        super(BasePassageSplitter, self).__init__(num_worker)

    @Base.timer
    def run(self, passage_list: list) -> list:
        """Split every passage and return one flat list of all the pieces."""
        splitted_passages = self._multi_process(self._split_func, passage_list)
        return self.reshape(splitted_passages)

    def reshape(self, splitted_passages: list) -> list:
        """Flatten a list of per-passage lists into a single list."""
        # from_iterable avoids unpacking every sub-list into call arguments.
        return list(chain.from_iterable(splitted_passages))

    def _split_func(self, passage: str) -> list:
        """Default splitter: cut the passage at every '。'.

        Returns:
            The list of pieces produced by ``str.split`` (the separator is
            not kept in the pieces).
        """
        return passage.split('。')
|
||
|
||
class BaseSentanceCleaner(Base):
    """Sentence-level cleaning stage.

    Subclasses customise the stage by overriding ``_clean_func``.
    """

    def __init__(self, num_worker):
        super().__init__(num_worker)

    @Base.timer
    def run(self, sentance_list: list) -> list:
        """Run ``_clean_func`` over every sentence and return the results as a list."""
        return self._multi_process(self._clean_func, sentance_list)

    def _clean_func(self, sentance: str) -> str:
        """Default cleaner: hand the sentence back unchanged."""
        return sentance
|
||
|
||
class Handler(Base):
    """The main pipeline: clean passages, split, clean sentences, segment."""

    def __init__(self, num_worker, user_dict=None):
        """Create an unconfigured handler.

        Args:
            num_worker: worker count, also passed to jieba's parallel mode.
            user_dict: optional user dictionary handed to
                ``jieba.load_userdict``; skipped when ``None``.
        """
        super(Handler, self).__init__(num_worker)
        # The three stages stay None until ``init`` is called.
        self.passage_cleaner = None
        self.passage_splitter = None
        self.sentance_cleaner = None
        if user_dict is not None:
            jieba.load_userdict(user_dict)

    @Base.timer
    def init(self, passage_cleaner, passage_splitter, sentance_cleaner):
        """Attach the three stage objects used by ``handle``."""
        self.passage_cleaner = passage_cleaner
        self.passage_splitter = passage_splitter
        self.sentance_cleaner = sentance_cleaner
        print('handler initialized')

    @Base.timer
    def segment(self, cleaned_sentances: list, use_hmm: bool = False) -> list:
        """Segment each sentence with jieba and join its tokens with spaces.

        jieba's parallel mode is switched on for the duration of the call and
        is always switched off again, even if segmentation raises.

        Args:
            cleaned_sentances: sentences to segment.
            use_hmm: forwarded to ``jieba.lcut`` as ``HMM``.

        Returns:
            One space-joined token string per input sentence.
        """
        jieba.enable_parallel(self.num_worker)
        try:
            return [
                ' '.join(jieba.lcut(sentance, HMM=use_hmm))
                for sentance in cleaned_sentances
            ]
        finally:
            # Release the parallel pool even when lcut fails part-way.
            jieba.disable_parallel()

    @Base.timer
    def handle(self, passage_list):
        """Run the whole pipeline over ``passage_list``.

        Raises:
            AssertionError: if ``init`` has not supplied all three stages.

        Returns:
            The segmented sentences produced by ``segment``.
        """
        # Explicit raise (not ``assert``) so the guard survives ``python -O``;
        # the exception type is kept for existing callers.
        for stage in ('passage_cleaner', 'passage_splitter', 'sentance_cleaner'):
            if getattr(self, stage) is None:
                raise AssertionError(f'{stage} is not set; call init() first')
        cleaned_passages = self.passage_cleaner.run(passage_list)
        splitted_passages = self.passage_splitter.run(cleaned_passages)
        cleaned_sentances = self.sentance_cleaner.run(splitted_passages)
        return self.segment(cleaned_sentances)
|
||
class Segmentor:
    """Stand-alone word segmenter backed by jieba."""

    def __init__(self, num_worker):
        # Stored for callers; ``segment`` itself does not read it.
        self.num_worker = num_worker

    def segment(self, sentance_list: list) -> list:
        """Segment each sentence and join its tokens with single spaces.

        The original body returned the undefined name ``segment`` and so
        raised NameError on every call.
        NOTE(review): the intended behaviour is assumed to mirror
        ``Handler.segment`` (``HMM=False``, space-joined tokens) - confirm.

        Returns:
            One space-joined token string per input sentence.
        """
        return [' '.join(jieba.lcut(sentance, HMM=False)) for sentance in sentance_list]
|
||
|
||
if __name__ == '__main__':
    # Smoke test: run the default (pass-through) stages end to end.
    passage_list = ['a。b。c。d。f。e','a。b。c。d。f。e','a。b。c。d。f。e']
    passage_cleaner = BasePassageCleaner(3)
    passage_splitter = BasePassageSplitter(3)
    sentance_cleaner = BaseSentanceCleaner(3)
    handler = Handler(3)
    handler.init(passage_cleaner, passage_splitter, sentance_cleaner)
    # handle() returns its result and stores nothing on the handler, so the
    # return value is what has to be printed.
    cleaned_sentances = handler.handle(passage_list)
    print(cleaned_sentances)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
from lm_corpus_processor_base import * | ||
|
||
import jieba | ||
import re | ||
|
||
class PassageCleaner(BasePassageCleaner):
    """Passage cleaner that strips HTML tags and collapses repeated punctuation."""

    def __init__(self, num_worker):
        super(PassageCleaner, self).__init__(num_worker)

    @staticmethod
    def remove_html(sentance: str) -> str:
        """Return ``sentance`` without HTML tags and with punctuation runs merged.

        Steps, in order: drop HTML tags; merge runs of commas; merge runs of
        spaces; rewrite ellipsis-like runs as ``...``; rewrite hyphen runs as
        ``--``; cap runs of three or more em dashes at three.
        """
        new_text = re.sub(r'</?\w+[^>]*>', '', sentance)  # HTML tags
        new_text = re.sub(",+", ",", new_text)  # merge commas
        new_text = re.sub(" +", " ", new_text)  # merge spaces
        # Merge ellipses. The original pattern "[...|…|。。。]+" was a character
        # class, so any single '.', '|', '…' or '。' was turned into "...";
        # a real alternation only touches ellipsis-like runs.
        new_text = re.sub(r"(?:\.{3,}|…|。{3,})+", "...", new_text)
        # NOTE(review): this also expands a lone '-' to "--" - confirm intent.
        new_text = re.sub("-+", "--", new_text)  # merge hyphens
        new_text = re.sub("———+", "———", new_text)  # merge em dashes
        return new_text

    def _clean_func(self, passage):
        """Clean one passage; the cleaned text (not the input) is returned."""
        return self.remove_html(passage)
|
||
|
||
class PassageSplitter(BasePassageSplitter):
    """Splitter that inserts newlines after sentence terminators, then splits on them."""

    def __init__(self, num_worker):
        super(PassageSplitter, self).__init__(num_worker)

    def _split_func(self, passage):
        """Split one passage into pieces at sentence-ending punctuation.

        Returns:
            The list obtained by splitting the marked-up passage on newlines.
        """
        # Raw strings keep the regex escapes (\? \. \…) out of Python's own
        # string-escape handling, which warns on unknown escapes.
        passage = re.sub(r'([;,。!?\?])([^”’])', r"\1\n\2", passage)  # single-character terminators
        passage = re.sub(r'(\.{6})([^”’])', r"\1\n\2", passage)  # English ellipsis
        passage = re.sub(r'(\…{2})([^”’])', r"\1\n\2", passage)  # Chinese ellipsis
        passage = re.sub(r'([;,。!?\?][”’])([^,。!?\?])', r'\1\n\2', passage)
        # When a terminator is followed by a closing quote, the quote is the
        # real end of the sentence, so the newline goes after the quote; the
        # rules above deliberately leave closing quotes attached.
        passage = passage.rstrip()  # drop any trailing newline at the end of the passage
        return passage.split("\n")
|
||
|
||
class SentanceCleaner(BaseSentanceCleaner):
    """Sentence cleaner that keeps only Chinese characters, digits and ASCII letters."""

    def __init__(self, num_worker, user_dict_file=None):
        # ``user_dict_file`` is accepted but not used by this class.
        super(SentanceCleaner, self).__init__(num_worker)

    @staticmethod
    def remove_other(sentance):
        """Return ``sentance`` with every character outside the kept sets removed.

        Kept: U+4E00..U+9FA5, '0'..'9', 'A'..'Z' and 'a'..'z'.
        """
        def is_chinese(uchar):
            """Tell whether a character lies in U+4E00..U+9FA5."""
            return u'\u4e00' <= uchar <= u'\u9fa5'

        def is_number(uchar):
            """Tell whether a character is an ASCII digit."""
            return u'\u0030' <= uchar <= u'\u0039'

        def is_alphabet(uchar):
            """Tell whether a character is an ASCII letter."""
            return u'\u0041' <= uchar <= u'\u005a' or u'\u0061' <= uchar <= u'\u007a'

        # join builds the result in one pass instead of repeated string +=.
        return ''.join(
            ch for ch in sentance
            if is_chinese(ch) or is_number(ch) or is_alphabet(ch)
        )

    def _clean_func(self, sentance):
        """Clean one sentence with ``remove_other``."""
        return self.remove_other(sentance)
|
||
|
||
if __name__ == '__main__':
    # Demo run: 300 copies of one sample passage through the full pipeline.
    handler = Handler(3)
    sample = '我爱北京天安门,天安门上太阳升。伟大领袖毛主席,指引我们向前进。'
    passage_list = [sample] * 300
    stages = (PassageCleaner(3), PassageSplitter(3), SentanceCleaner(3))
    handler.init(*stages)
    segmented = handler.handle(passage_list)
    # Show only the first ten results.
    print(segmented[:10])
|
||
|
||
|
||
|
||
|
||
|