
Commit
Merge pull request #12 from LlmKira/cjieba
Cjieba
sudoskys authored Sep 6, 2023
2 parents 2d93ef7 + ece4185 commit ce73b9b
Showing 9 changed files with 30 additions and 50 deletions.
4 changes: 2 additions & 2 deletions docker-compose.yml
@@ -8,10 +8,10 @@ services:
       resources:
         limits:
           cpus: '0.80'
-          memory: 990M
+          memory: 1500M
         reservations:
           cpus: '0.25'
-          memory: 550M
+          memory: 500M
     depends_on:
       - redis
       - rabbitmq
11 changes: 6 additions & 5 deletions middleware/filter/api/keyphrase/keyphrase.py
@@ -3,8 +3,7 @@
 
 import re
 
-import jieba
-import jieba.analyse
+import cjieba
 import numpy as np
 
 from ..solo import singleton
@@ -27,16 +26,18 @@ def __init__(self, topk=50, method='tfidf', with_word=True):
     def cut_sentences(self, text):
         """Split the text into sentences, then cut each sentence into words"""
         sentences = re.findall(".*?[。?!]", text)
-        cut_sentences = [jieba.lcut(sent) for sent in sentences]
+        cut_sentences = [cjieba.lcut(sent) for sent in sentences]
         return cut_sentences
 
     def key_words_extraction(self, text):
         """Extract keywords"""
         keywords_score = []
         if self.method == 'tfidf':
-            keywords_score = jieba.analyse.extract_tags(text, topK=self.topk, withWeight=True)
+            # keywords_score = jieba.analyse.extract_tags(text, topK=self.topk, withWeight=True)
+            keywords_score = cjieba.extract(text, top_k=self.topk, with_weight=True)
         elif self.method == 'textrank':
-            keywords_score = jieba.analyse.textrank(text, topK=self.topk, withWeight=True)
+            raise NotImplementedError('textrank method is not implemented')
+            # keywords_score = jieba.analyse.textrank(text, topK=self.topk, withWeight=True)
         return {word: score for word, score in keywords_score}
 
     def key_phrase_extraction(self, text):
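Note: this commit swaps jieba.analyse.extract_tags(topK=..., withWeight=...) for cjieba.extract(top_k=..., with_weight=...), and method='textrank' now raises NotImplementedError, so only the TF-IDF path survives the migration. A minimal sketch of the call shape the new code relies on, with the cjieba signature inferred from this diff rather than verified against the library's documentation:

    import cjieba

    text = "自然语言处理是人工智能领域的一个重要方向。"
    # Assumed to mirror jieba's TF-IDF extraction: returns ranked
    # (word, weight) pairs when with_weight=True.
    for word, weight in cjieba.extract(text, top_k=5, with_weight=True):
        print(word, weight)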
19 changes: 0 additions & 19 deletions middleware/filter/api/keywords/textrank.py

This file was deleted.

10 changes: 3 additions & 7 deletions middleware/filter/api/keywords/tfidf.py
@@ -1,20 +1,16 @@
 # -*- coding: utf-8 -*-
 
-import jieba
-import jieba.analyse
+import cjieba
 
-from ..keywords import STOPWORDS
 from ..solo import singleton
 
 
 @singleton
 class TfidfKeywords:
     def __init__(self, delete_stopwords=True, topK=20, withWeight=False):
-        if delete_stopwords:
-            jieba.analyse.set_stop_words(STOPWORDS)
-
         self.topk = topK
         self.with_wight = withWeight
 
     def keywords(self, sentence):
-        return jieba.analyse.extract_tags(sentence, topK=self.topk, withWeight=self.with_wight)
+        return cjieba.extract(text=sentence, top_k=self.topk, with_weight=self.with_wight)
+        # return jieba.analyse.extract_tags(sentence, topK=self.topk, withWeight=self.with_wight)
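After this change TfidfKeywords still accepts delete_stopwords but never acts on it: the jieba.analyse.set_stop_words(STOPWORDS) call is gone and the diff adds no cjieba replacement. If the old behaviour is wanted, a post-filter along these lines could restore it (keywords_without_stopwords is a hypothetical helper, not part of the commit):

    import cjieba

    def keywords_without_stopwords(sentence, stopwords, topk=20):
        # Drop stopwords from cjieba's ranked (word, weight) pairs,
        # approximating the removed jieba.analyse.set_stop_words hook.
        tags = cjieba.extract(text=sentence, top_k=topk, with_weight=True)
        return [(word, weight) for word, weight in tags if word not in stopwords]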
4 changes: 2 additions & 2 deletions middleware/filter/api/summarization/textrank_summarization.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 
 
-import jieba
+import cjieba
 import numpy as np
 from nltk.cluster.util import cosine_distance
 
@@ -32,7 +32,7 @@ def split_doc(doc, stopwords=None):
                 break
     for sent in sentences:
         if len(sent) > MIN_SEQ_LEN:
-            cut_sentences.append([word for word in jieba.cut(sent) if word not in stopwords])
+            cut_sentences.append([word for word in cjieba.cut(sent) if word not in stopwords])
             origin_sentences.append(sent)
     return origin_sentences, cut_sentences
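Here cjieba.cut is treated as a drop-in for jieba.cut, i.e. an iterable of token strings. A small sketch of the filtering pattern used in split_doc, assuming that behaviour holds:

    import cjieba

    stopwords = {"的", "了", "是"}  # illustrative stopword set
    sent = "这是一个用来演示分词和停用词过滤的句子。"
    tokens = [w for w in cjieba.cut(sent) if w not in stopwords]
    print(tokens)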
5 changes: 3 additions & 2 deletions middleware/filter/api/summarization/tfidf_summarization.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 
 
-import jieba.analyse
+import cjieba
 
 from ..solo import singleton
 from ..summarization import STOPWORDS
 
@@ -26,7 +26,8 @@ def split_doc(doc):
 
 
 def calculate_sentence_score(sentence, stopwords):
-    jieba_ret = jieba.analyse.extract_tags(sentence, topK=100, withWeight=True)  # , allowPOS=('ns', 'n', 'vn', 'v'))
+    # jieba_ret = jieba.analyse.extract_tags(sentence, topK=100, withWeight=True)  # , allowPOS=('ns', 'n', 'vn', 'v'))
+    jieba_ret = cjieba.extract(sentence, top_k=100, with_weight=True)
     sentence_score = 0
     for word, score in jieba_ret:
         if word not in stopwords:
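The surrounding function scores a sentence as the sum of TF-IDF weights of its non-stopword keywords, so the jieba-to-cjieba swap only changes where the (word, weight) pairs come from. A compact sketch of the same scoring, assuming cjieba.extract behaves as used in this diff:

    import cjieba

    def sentence_score(sentence, stopwords):
        # Sum the TF-IDF weights of every non-stopword keyword.
        pairs = cjieba.extract(sentence, top_k=100, with_weight=True)
        return sum(weight for word, weight in pairs if word not in stopwords)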
19 changes: 10 additions & 9 deletions middleware/filter/api/text_similarity/cosion.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 from typing import Tuple, Union
 
-import jieba
+import cjieba
 from sklearn.metrics.pairwise import cosine_similarity
 
 from ..solo import singleton
 
@@ -18,24 +18,25 @@ class CosionSimilarity(object):
     def __init__(self):
         self.stopwords = self.load_stopwords(STOPWORDS)
 
-    def load_stopwords(self, stopwords_path):
+    @staticmethod
+    def load_stopwords(stopwords_path):
         with open(stopwords_path, 'r', encoding='utf-8') as f:
-            return [line.strip() for line in f]
+            return set([line.strip() for line in f])
 
-    def cut_words(self, text, stopwords):
-        return [word for word in jieba.cut(text) if word not in stopwords]
+    def cut_words(self, text):
+        return [word for word in cjieba.cut(text) if word not in self.stopwords]
 
     def str_to_vector(self, text1: str, text2: str) -> Tuple[list, list]:
-        text1_words = set(self.cut_words(text1, self.stopwords))
-        text2_words = set(self.cut_words(text2, self.stopwords))
+        text1_words = set(self.cut_words(text1))
+        text2_words = set(self.cut_words(text2))
         all_words = list(text1_words | text2_words)
         text1_vector = [1 if word in text1_words else 0 for word in all_words]
         text2_vector = [1 if word in text2_words else 0 for word in all_words]
         return text1_vector, text2_vector
 
     def similarity(self, text1: Union[str, list], text2: Union[str, list]):
-        text1_words = set(self.cut_words(text1, self.stopwords))
-        text2_words = set(self.cut_words(text2, self.stopwords))
+        text1_words = set(self.cut_words(text1))
+        text2_words = set(self.cut_words(text2))
         all_words = list(text1_words | text2_words)
         text1_vector = [1 if word in text1_words else 0 for word in all_words]
         text2_vector = [1 if word in text2_words else 0 for word in all_words]
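The vectors built here are binary bag-of-words indicators over the union vocabulary, so the similarity reflects word overlap only, not term frequency. A self-contained sketch of the same construction with hand-picked tokens (no segmentation, to stay independent of cjieba):

    from sklearn.metrics.pairwise import cosine_similarity

    words1 = {"deep", "learning", "model"}
    words2 = {"deep", "model", "training"}
    vocab = list(words1 | words2)
    v1 = [1 if w in words1 else 0 for w in vocab]
    v2 = [1 if w in words2 else 0 for w in vocab]
    # Two shared words out of three per text: 2/3 ≈ 0.667
    print(cosine_similarity([v1], [v2])[0][0])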
7 changes: 3 additions & 4 deletions middleware/filter/api/text_similarity/simhash.py
@@ -1,8 +1,6 @@
 # -*- coding: utf-8 -*-
 
 
-import jieba
-import jieba.analyse
+import cjieba
 
-
 def cut_words_weights(content):
 
@@ -14,7 +12,8 @@ def cut_words_weights(content):
     # use jieba to extract keywords and their weights
    # set the stopwords
     # jieba.analyse.set_stop_words('path_of_stopwords')
-    tags = jieba.analyse.extract_tags(content, topK=20, withWeight=True)
+    # tags = jieba.analyse.extract_tags(content, topK=20, withWeight=True)
+    tags = cjieba.extract(text=content, top_k=20, with_weight=True)
     tags = [(keyword, int(weight * 10)) for keyword, weight in tags]
     return tags
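cut_words_weights scales each TF-IDF weight by 10 and truncates it to an int so it can act as a vote count in a SimHash fingerprint. A sketch of the standard fingerprint built from such (keyword, weight) pairs; hash64 is a stand-in, since the file's actual hashing code is outside this diff:

    import hashlib

    def hash64(word):
        # Stand-in 64-bit hash (the real one in simhash.py is not shown here).
        return int(hashlib.md5(word.encode("utf-8")).hexdigest(), 16) & ((1 << 64) - 1)

    def simhash(tags):
        # tags: [(keyword, int_weight)] as produced by cut_words_weights.
        v = [0] * 64
        for word, weight in tags:
            h = hash64(word)
            for i in range(64):
                v[i] += weight if (h >> i) & 1 else -weight
        # Bit i of the fingerprint is 1 where the weighted votes are positive.
        return sum(1 << i for i in range(64) if v[i] > 0)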
1 change: 1 addition & 0 deletions requirements.txt
@@ -37,3 +37,4 @@ numpy
 jieba
 fasttext-wheel
 scikit-learn
+cjieba
