
Commit
Merge pull request #12 from LlmKira/cjieba
Cjieba
sudoskys authored Sep 6, 2023
2 parents 2d93ef7 + ece4185 commit ce73b9b
Showing 9 changed files with 30 additions and 50 deletions.
4 changes: 2 additions & 2 deletions docker-compose.yml
@@ -8,10 +8,10 @@ services:
       resources:
         limits:
           cpus: '0.80'
-          memory: 990M
+          memory: 1500M
         reservations:
           cpus: '0.25'
-          memory: 550M
+          memory: 500M
     depends_on:
       - redis
       - rabbitmq
11 changes: 6 additions & 5 deletions middleware/filter/api/keyphrase/keyphrase.py
@@ -3,8 +3,7 @@
 
 import re
 
-import jieba
-import jieba.analyse
+import cjieba
 import numpy as np
 
 from ..solo import singleton
@@ -27,16 +26,18 @@ def __init__(self, topk=50, method='tfidf', with_word=True):
     def cut_sentences(self, text):
         """Split the text into sentences, then cut each sentence into words"""
         sentences = re.findall(".*?[。?!]", text)
-        cut_sentences = [jieba.lcut(sent) for sent in sentences]
+        cut_sentences = [cjieba.lcut(sent) for sent in sentences]
         return cut_sentences
 
     def key_words_extraction(self, text):
         """Extract keywords"""
         keywords_score = []
         if self.method == 'tfidf':
-            keywords_score = jieba.analyse.extract_tags(text, topK=self.topk, withWeight=True)
+            # keywords_score = jieba.analyse.extract_tags(text, topK=self.topk, withWeight=True)
+            keywords_score = cjieba.extract(text, top_k=self.topk, with_weight=True)
         elif self.method == 'textrank':
-            keywords_score = jieba.analyse.textrank(text, topK=self.topk, withWeight=True)
+            raise NotImplementedError('textrank method is not implemented')
+            # keywords_score = jieba.analyse.textrank(text, topK=self.topk, withWeight=True)
         return {word: score for word, score in keywords_score}
 
     def key_phrase_extraction(self, text):
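Note: this commit swaps jieba.analyse.extract_tags(topK=..., withWeight=...) for cjieba.extract(top_k=..., with_weight=...), and method='textrank' now raises NotImplementedError, so only the TF-IDF path survives the migration. A minimal sketch of the call shape the new code relies on, with the cjieba signature inferred from this diff rather than verified against the library's documentation:

    import cjieba

    text = "自然语言处理是人工智能领域的一个重要方向。"
    # Assumed to mirror jieba's TF-IDF extraction: returns ranked
    # (word, weight) pairs when with_weight=True.
    for word, weight in cjieba.extract(text, top_k=5, with_weight=True):
        print(word, weight)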
19 changes: 0 additions & 19 deletions middleware/filter/api/keywords/textrank.py

This file was deleted.

10 changes: 3 additions & 7 deletions middleware/filter/api/keywords/tfidf.py
@@ -1,20 +1,16 @@
 # -*- coding: utf-8 -*-
 
-import jieba
-import jieba.analyse
+import cjieba
 
-from ..keywords import STOPWORDS
 from ..solo import singleton
 
 
 @singleton
 class TfidfKeywords:
     def __init__(self, delete_stopwords=True, topK=20, withWeight=False):
-        if delete_stopwords:
-            jieba.analyse.set_stop_words(STOPWORDS)
-
         self.topk = topK
         self.with_wight = withWeight
 
     def keywords(self, sentence):
-        return jieba.analyse.extract_tags(sentence, topK=self.topk, withWeight=self.with_wight)
+        return cjieba.extract(text=sentence, top_k=self.topk, with_weight=self.with_wight)
+        # return jieba.analyse.extract_tags(sentence, topK=self.topk, withWeight=self.with_wight)
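After this change TfidfKeywords still accepts delete_stopwords but never acts on it: the jieba.analyse.set_stop_words(STOPWORDS) call is gone and the diff adds no cjieba replacement. If the old behaviour is wanted, a post-filter along these lines could restore it (keywords_without_stopwords is a hypothetical helper, not part of the commit):

    import cjieba

    def keywords_without_stopwords(sentence, stopwords, topk=20):
        # Drop stopwords from cjieba's ranked (word, weight) pairs,
        # approximating the removed jieba.analyse.set_stop_words hook.
        tags = cjieba.extract(text=sentence, top_k=topk, with_weight=True)
        return [(word, weight) for word, weight in tags if word not in stopwords]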
4 changes: 2 additions & 2 deletions middleware/filter/api/summarization/textrank_summarization.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 
 
-import jieba
+import cjieba
 import numpy as np
 from nltk.cluster.util import cosine_distance
 
@@ -32,7 +32,7 @@ def split_doc(doc, stopwords=None):
                 break
     for sent in sentences:
         if len(sent) > MIN_SEQ_LEN:
-            cut_sentences.append([word for word in jieba.cut(sent) if word not in stopwords])
+            cut_sentences.append([word for word in cjieba.cut(sent) if word not in stopwords])
             origin_sentences.append(sent)
     return origin_sentences, cut_sentences
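Here cjieba.cut is treated as a drop-in for jieba.cut, i.e. an iterable of token strings. A small sketch of the filtering pattern used in split_doc, assuming that behaviour holds:

    import cjieba

    stopwords = {"的", "了", "是"}  # illustrative stopword set
    sent = "这是一个用来演示分词和停用词过滤的句子。"
    tokens = [w for w in cjieba.cut(sent) if w not in stopwords]
    print(tokens)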
5 changes: 3 additions & 2 deletions middleware/filter/api/summarization/tfidf_summarization.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 
 
-import jieba.analyse
+import cjieba
 
 from ..solo import singleton
 from ..summarization import STOPWORDS
 
@@ -26,7 +26,8 @@ def split_doc(doc):
 
 
 def calculate_sentence_score(sentence, stopwords):
-    jieba_ret = jieba.analyse.extract_tags(sentence, topK=100, withWeight=True)  # , allowPOS=('ns', 'n', 'vn', 'v'))
+    # jieba_ret = jieba.analyse.extract_tags(sentence, topK=100, withWeight=True)  # , allowPOS=('ns', 'n', 'vn', 'v'))
+    jieba_ret = cjieba.extract(sentence, top_k=100, with_weight=True)
     sentence_score = 0
     for word, score in jieba_ret:
         if word not in stopwords:
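The surrounding function scores a sentence as the sum of TF-IDF weights of its non-stopword keywords, so the jieba-to-cjieba swap only changes where the (word, weight) pairs come from. A compact sketch of the same scoring, assuming cjieba.extract behaves as used in this diff:

    import cjieba

    def sentence_score(sentence, stopwords):
        # Sum the TF-IDF weights of every non-stopword keyword.
        pairs = cjieba.extract(sentence, top_k=100, with_weight=True)
        return sum(weight for word, weight in pairs if word not in stopwords)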
19 changes: 10 additions & 9 deletions middleware/filter/api/text_similarity/cosion.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 from typing import Tuple, Union
 
-import jieba
+import cjieba
 from sklearn.metrics.pairwise import cosine_similarity
 
 from ..solo import singleton
 
@@ -18,24 +18,25 @@ class CosionSimilarity(object):
     def __init__(self):
         self.stopwords = self.load_stopwords(STOPWORDS)
 
-    def load_stopwords(self, stopwords_path):
+    @staticmethod
+    def load_stopwords(stopwords_path):
         with open(stopwords_path, 'r', encoding='utf-8') as f:
-            return [line.strip() for line in f]
+            return set([line.strip() for line in f])
 
-    def cut_words(self, text, stopwords):
-        return [word for word in jieba.cut(text) if word not in stopwords]
+    def cut_words(self, text):
+        return [word for word in cjieba.cut(text) if word not in self.stopwords]
 
     def str_to_vector(self, text1: str, text2: str) -> Tuple[list, list]:
-        text1_words = set(self.cut_words(text1, self.stopwords))
-        text2_words = set(self.cut_words(text2, self.stopwords))
+        text1_words = set(self.cut_words(text1))
+        text2_words = set(self.cut_words(text2))
         all_words = list(text1_words | text2_words)
         text1_vector = [1 if word in text1_words else 0 for word in all_words]
         text2_vector = [1 if word in text2_words else 0 for word in all_words]
         return text1_vector, text2_vector
 
     def similarity(self, text1: Union[str, list], text2: Union[str, list]):
-        text1_words = set(self.cut_words(text1, self.stopwords))
-        text2_words = set(self.cut_words(text2, self.stopwords))
+        text1_words = set(self.cut_words(text1))
+        text2_words = set(self.cut_words(text2))
         all_words = list(text1_words | text2_words)
         text1_vector = [1 if word in text1_words else 0 for word in all_words]
         text2_vector = [1 if word in text2_words else 0 for word in all_words]
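The vectors built here are binary bag-of-words indicators over the union vocabulary, so the similarity reflects word overlap only, not term frequency. A self-contained sketch of the same construction with hand-picked tokens (no segmentation, to stay independent of cjieba):

    from sklearn.metrics.pairwise import cosine_similarity

    words1 = {"deep", "learning", "model"}
    words2 = {"deep", "model", "training"}
    vocab = list(words1 | words2)
    v1 = [1 if w in words1 else 0 for w in vocab]
    v2 = [1 if w in words2 else 0 for w in vocab]
    # Two shared words out of three per text: 2/3 ≈ 0.667
    print(cosine_similarity([v1], [v2])[0][0])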
7 changes: 3 additions & 4 deletions middleware/filter/api/text_similarity/simhash.py
@@ -1,8 +1,6 @@
 # -*- coding: utf-8 -*-
 
 
-import jieba
-import jieba.analyse
+import cjieba
 
-
 def cut_words_weights(content):
 
@@ -14,7 +12,8 @@ def cut_words_weights(content):
     # use jieba to extract keywords and their weights
    # set the stopwords
     # jieba.analyse.set_stop_words('path_of_stopwords')
-    tags = jieba.analyse.extract_tags(content, topK=20, withWeight=True)
+    # tags = jieba.analyse.extract_tags(content, topK=20, withWeight=True)
+    tags = cjieba.extract(text=content, top_k=20, with_weight=True)
     tags = [(keyword, int(weight * 10)) for keyword, weight in tags]
     return tags
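cut_words_weights scales each TF-IDF weight by 10 and truncates it to an int so it can act as a vote count in a SimHash fingerprint. A sketch of the standard fingerprint built from such (keyword, weight) pairs; hash64 is a stand-in, since the file's actual hashing code is outside this diff:

    import hashlib

    def hash64(word):
        # Stand-in 64-bit hash (the real one in simhash.py is not shown here).
        return int(hashlib.md5(word.encode("utf-8")).hexdigest(), 16) & ((1 << 64) - 1)

    def simhash(tags):
        # tags: [(keyword, int_weight)] as produced by cut_words_weights.
        v = [0] * 64
        for word, weight in tags:
            h = hash64(word)
            for i in range(64):
                v[i] += weight if (h >> i) & 1 else -weight
        # Bit i of the fingerprint is 1 where the weighted votes are positive.
        return sum(1 << i for i in range(64) if v[i] > 0)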
1 change: 1 addition & 0 deletions requirements.txt
@@ -37,3 +37,4 @@ numpy
 jieba
 fasttext-wheel
 scikit-learn
+cjieba
