-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathall.py
181 lines (148 loc) · 6.18 KB
/
all.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
import re,json
from collections import Counter
'''get corpus of documents'''
def get_dico():
    """Load the French word list and the corpus as a single text blob.

    Reads ``data/dico/liste.de.mots.francais.frgut.txt`` and
    ``data/corpus/corpus.txt`` (both relative to the current working
    directory). Each file is decoded as UTF-8 first; if that decoding fails
    it is read again with the platform default encoding.

    Returns:
        str: the dictionary text, a newline, then the corpus text.

    Raises:
        OSError: if either file cannot be opened.
    """
    def _read_text(path):
        # `with` guarantees the handle is closed, even when decoding fails.
        try:
            with open(path, 'r', encoding="utf-8") as handle:
                return handle.read()
        except UnicodeDecodeError:
            # Not valid UTF-8: fall back to the locale's default encoding,
            # keeping the original best-effort behaviour.
            with open(path, 'r') as handle:
                return handle.read()

    dico = _read_text("data/dico/liste.de.mots.francais.frgut.txt")
    corpus = _read_text('data/corpus/corpus.txt')
    # The separator keeps the last dictionary word from fusing with the first
    # corpus word when the dictionary file has no trailing newline.
    return dico + "\n" + corpus
''' a function to remove diacritics from letters'''
import unicodedata, re, string
def remove_accents(input_str):
    """Return ``input_str`` with every combining diacritic mark removed.

    The text is decomposed (NFD) so accents become separate combining
    characters, those characters are dropped, and the remainder is
    recomposed (NFC).
    """
    decomposed = unicodedata.normalize('NFD', input_str)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return unicodedata.normalize('NFC', ''.join(base_chars))
''' a function to clean sentence and return only words'''
def clean_sentence(texte):
    """Normalise a sentence so that only space-separated words remain.

    Steps, in order: strip diacritics, lowercase, delete ``@mentions``,
    replace every character of ``string.punctuation`` with a space, collapse
    each run of two or more whitespace characters into one space, and trim
    both ends.
    """
    lowered = remove_accents(texte).lower()
    without_mentions = re.sub(r'@\w+', '', lowered)
    # Punctuation becomes a space (not nothing) so adjacent words stay apart.
    punctuation_class = r'[%s]' % re.escape(string.punctuation)
    spaced = re.sub(punctuation_class, ' ', without_mentions)
    collapsed = re.sub(r'\s{2,}', ' ', spaced)
    return collapsed.strip()
'''cleaning and tokenization'''
def tokenize_sentence(texte):
    """Clean ``texte`` with ``clean_sentence`` and split it on whitespace.

    Returns the resulting list of words.
    """
    return clean_sentence(texte).split()
'''
#alternatives
def tokenize_sentence_way2(texte):
#retourner les groupes d'alphabets
return re.findall(r'\w+', texte.lower())
def tokenize_sentence_way3(texte):
#clean the sentence
blob_object = TextBlob(texte)
#tokenize
liste_words = blob_object.words
#return
return liste_words
'''
''' remove apostrophe and get only base form of a word'''
def strip_apostrophe(liste_words):
    """Keep only the part of each word that follows its last apostrophe.

    Both the ASCII apostrophe (') and the typographic one (U+2019) are
    recognised, so "l'arbre" and "l\u2019arbre" both yield "arbre". Words
    without an apostrophe are returned unchanged.

    Args:
        liste_words: iterable of words.

    Returns:
        list: one radical per input word, in the same order.
    """
    # Map the typographic apostrophe onto the ASCII one first so that a
    # single rpartition handles both spellings.
    return [
        word.replace('\u2019', "'").rpartition("'")[2]
        for word in liste_words
    ]
''' First text preprocessing step: use the functions above to extract clean words (3 letters minimum) from a sentence. '''
def pre_process(sentence):
    """Turn a raw sentence into a list of cleaned words.

    Underscores are deleted, the sentence is tokenized, tokens shorter than
    three characters are dropped, and the apostrophe prefix of each remaining
    token is stripped. The resulting list is printed and returned.
    """
    # Underscores are removed outright rather than replaced by a space.
    tokens = tokenize_sentence(sentence.replace('_', ''))
    # Discard tokens of one or two characters.
    long_tokens = [token for token in tokens if len(token) > 2]
    # Keep the radical that follows the apostrophe.
    liste_words = strip_apostrophe(long_tokens)
    print('\nsentence to words : ',liste_words)
    return liste_words
'''Spelling correction of words with respect to a corpus'''
def edits1(word):
    """Return the set of strings exactly one edit away from ``word``.

    An edit is a deletion, a transposition of two adjacent characters, a
    replacement by a letter a-z, or an insertion of a letter a-z.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    results = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        # Insertion is possible at every cut point, including the end.
        for letter in alphabet:
            results.add(head + letter + tail)
        if not tail:
            continue
        # Deletion and replacement act on the first character of the tail.
        results.add(head + tail[1:])
        for letter in alphabet:
            results.add(head + letter + tail[1:])
        # Transposition needs at least two characters in the tail.
        if len(tail) > 1:
            results.add(head + tail[1] + tail[0] + tail[2:])
    return results
def edits2(word):
    """Lazily yield every string that is two edits away from ``word``.

    Returns a generator; the same string may be produced more than once.
    """
    one_edit_away = edits1(word)
    return (
        second
        for first in one_edit_away
        for second in edits1(first)
    )
def known(words):
    """Return the set of entries of ``words`` that are present in WORDS."""
    return {entry for entry in words if entry in WORDS}
def candidates(word):
    """Return the possible spelling corrections for ``word``.

    The first non-empty result wins, tried in this order: the word itself if
    known, known words one edit away, known words two edits away, and
    finally the word unchanged (as a one-element list).
    """
    exact = known([word])
    if exact:
        return exact
    one_edit = known(edits1(word))
    if one_edit:
        return one_edit
    two_edits = known(edits2(word))
    if two_edits:
        return two_edits
    return [word]
def DICO_ET_CORRECTEUR():
    """Build the word-frequency table and the spelling corrector.

    The text returned by ``get_dico`` is run through ``pre_process`` and the
    resulting words are counted.

    Returns:
        tuple: ``(WORDS, correction)`` where ``WORDS`` is a ``Counter``
        mapping each word to its number of occurrences, and ``correction``
        is a function taking a word and returning the candidate (from
        ``candidates``) with the highest count in ``WORDS``.
    """
    WORDS = Counter(pre_process(get_dico()))

    def correction(word):
        """Return the most frequent spelling candidate for ``word``."""
        # Ranking by raw count picks the same winner as ranking by
        # count / total, and cannot raise ZeroDivisionError when the
        # dictionary and corpus are empty. Counter returns 0 for unknown
        # words without inserting them.
        return max(candidates(word), key=lambda candidate: WORDS[candidate])

    return WORDS, correction
# Build the word-frequency table and the corrector once, at import time.
# WORDS is the module-level table that `known` looks words up in.
# NOTE: DICO_ET_CORRECTEUR reads the dictionary and corpus files, so merely
# importing this module performs file I/O.
WORDS,CORRECTION = DICO_ET_CORRECTEUR()
# Stop words: loaded from a JSON list, then stripped of accents so they
# compare equal to tokens produced by clean_sentence (which removes accents).
# The encoding is explicit because JSON text is UTF-8 by specification;
# relying on the platform default mis-decodes accented words on systems whose
# locale encoding is not UTF-8.
with open("data/stop_words/stp_words_.json", 'r', encoding="utf-8") as json_file:
    STOPWORDS = json.load(json_file)  # a list
STOPWORDS = list(map(remove_accents, STOPWORDS))
# Lemmatizer data: a JSON object used as a word -> replacement lookup table.
# Forward slashes are used because they work on every platform (including
# Windows), whereas a backslash-separated path only resolves on Windows.
# The encoding is explicit because JSON text is UTF-8 by specification.
with open("data/lemma_dico/sample_.json", 'r', encoding="utf-8") as json_file:
    LISTE = json.load(json_file)  # a key/value dict


def my_stemmer(word):
    """Return the entry stored for ``word`` in LISTE, or ``word`` if absent."""
    return LISTE[word] if word in LISTE else word
''' Use all the functionality above (preprocessing, correction, stemming) to get correct French words from any French sentence '''
def SENTENCE_TO_CORRECT_WORDS(sentence):
    """Return the list of corrected, stemmed, non-stop words of ``sentence``.

    Runs four stages in order (pre-processing, spelling correction,
    stemming, stop-word removal) and prints a banner and the intermediate
    word list after each one.
    """
    print('\n------------pre_process--------\n')
    words = pre_process(sentence)
    print(words)

    print('\n------------correction--------\n')
    words = [CORRECTION(word) for word in words]
    print(words)

    print('\n------------stemming--------\n')
    words = [my_stemmer(word) for word in words]
    print(words)

    print('\n------------remove stp-words--------\n')
    words = list(filter(lambda word: word not in STOPWORDS, words))
    print(words)

    print('\n-------------------------------------\n')
    return words
''' a test here'''
# Number of empty inputs received so far; the prompt loop below stops once
# two have been entered (they do not need to be consecutive).
out = 0
if __name__ == '__main__':
    # Fixed demonstration sentence first.
    print('\n-------------------------------------\n')
    sentence = 'voilà ma phrase'
    print('sentence: ',sentence)
    liste_words = SENTENCE_TO_CORRECT_WORDS(sentence)
    print('liste_words:',liste_words)
    print('\n-------------------------------------\n')
    print('\ndes phrases à mots raté, d\'une faute ou deux, à corriger')
    # Interactive loop: every non-empty line is run through the pipeline.
    while out != 2:
        sentence = input('sentence or word: ')
        if not sentence:
            out += 1
            continue
        liste_words = SENTENCE_TO_CORRECT_WORDS(sentence)