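"""Weblang: scrape web search results for sentences containing a target query,
score them with a trained understandability model, and surface good study
sentences for language learners."""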
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webscraping import parse_another_site, search
from simple_translator import translateEnglish
from sentence_segmentation import display_separated
from understandability_algorithm import Understandability
"""setup selenium"""
print('setting up selenium')
#os.environ['PATH'] += r';C:\Users\chenz\Documents\GitHub\East-AI-Unihacks-Team-2\chromewebdriver'
os.environ['PATH'] += r';D:/Selenium_webautomation_drivers'
options = Options()
options.headless = False
options.add_argument("--log-level=3")
driver = webdriver.Chrome(options=options)
print('finished setting up selenium')
input('buffer')
#import PySimpleGUI as pg
#"""setup gui"""
#window = sg.Window(title="Hello World", layout=[[]], margins=(100, 50)).read()
"""
examples, googlesearch querys:
breakfast, the first response brings us to a good connection with japan '朝ご飯', https://www.kurashiru.com/lists/d5d8b53c-5cf2-4c4b-b623-9f95ca0666ab
the problem is that curated information souces, like geeksforgeeks often lack detailed or up to date information.
english test query = lightning,
Alt: 朝ごはん,
Result: query: 朝ご飯, sentence: 1000人が絶賛の朝ご飯レシピ, trans: Breakfast recipe acclaimed by 1000 people
We may be able to reroute searches to japanese reddit
Interesting Results Archive:
行き先: 【合唱曲】行き先 / 歌詞付き: [Chorus] Destination / with lyrics: Youtube Video
めいわくでんわ: 迷惑電話ストップサービス: Prank call stop service: Japan has a scam service stop service? America does not have this.
"""
def weblang(query_origin, language='de', target_sentences=10):
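    """Interactive study-session loop.

    Searches for the exact query, walks through scraped sentences with
    optional human ratings ('improvement mode'), writes accepted sentences to
    output.txt, opens the first match in the browser, and can retrain the
    understandability model with the new ratings.
    """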
    input('press Enter to begin...')
    # Wrap the query in quotes so the search engine matches it exactly.
    query = '"' + query_origin + '"'
    target_understandability = 1.75
    print('ai training start')
    understandability_algorithm = Understandability('data_to_train.csv', debug=False)
    understandability_algorithm.train()
    print('ai training done')
    # print('Class testing:', understandability_algorithm.predict("vocabs are ontime and dazzling and fantastic."))
    response = search(query, pause=2, num=30, stop=30, lang=language)
    # response = search(query, tld='co.in', pause=2)
    # The translation API is currently broken; score raw sentences instead of translations.
    api_broken = True
    with open('storage.csv', 'w', encoding='utf8') as f:
        good_sentences = 0
        output_sentences = []
        data_sentences = []
        for website_number in range(30):
            if good_sentences >= target_sentences:
                break
            print(website_number)
            sentences, url = parse_another_site(response, driver, f, query_origin)
            try:
                parse_limit = int(input('how many sentences to review: '))
                mode = input('improvement mode, y/n: ')
            except ValueError:
                print('skipping this website')
                parse_limit = 0
                mode = 'n'
            for i, sentence in enumerate(sentences):
                if i >= parse_limit:
                    break
                if mode == 'y':
                    print('human guidance mode')
                    print('|uncovered sentence:', sentence, '\n|url', url)
                    understandability = float(input('understandability of this sentence: '))
                    data_sentences.append((sentence, understandability))
                    if abs(understandability - target_understandability) < .5:
                        output_sentences.append((sentence, url))
                        good_sentences += 1
                elif not api_broken:
                    print('automatic mode')
                    converted_sentence, converted_sentence_pronounciation = translateEnglish(sentence)
                    # Prefer scoring the pronunciation when the translator provides one.
                    if converted_sentence_pronounciation is None:
                        score = understandability_algorithm.predict(converted_sentence)
                    else:
                        score = understandability_algorithm.predict(converted_sentence_pronounciation)
                    if score == 1:
                        output_sentences.append([sentence, converted_sentence, converted_sentence_pronounciation, url, score])
                        good_sentences += 1
                else:
                    print('automatic mode')
                    score = understandability_algorithm.predict(sentence)
                    print(score)
                    if score == 1:
                        output_sentences.append([sentence, url, score])
                        good_sentences += 1
    print('finished')
    try:
        with open('output.txt', 'w', encoding='utf8') as f:
            f.write('\n\n')
            f.write('\n'.join([str(i) for i in output_sentences]))
        # Open the first collected sentence's source page in the browser.
        checkout = output_sentences[0][1]
        checkout_sentence = output_sentences[0][0]
        driver.get(checkout)
        # import pyperclip
        # pyperclip.copy(checkout_sentence)
        # spam = pyperclip.paste()
        # print(spam)
        if not api_broken:
            converted_sentence, converted_sentence_pronounciation = translateEnglish(checkout_sentence)
            print('\n|original', checkout_sentence, '\n|translated', converted_sentence, '\n|pronounciation', converted_sentence_pronounciation, '\n|url', url, '\n|comprehension level', score)
            display_separated(converted_sentence_pronounciation, 'en')
    except IndexError:
        print('query busted: do not include underlines or special formatting; the query must appear in the website text exactly')
    input('close webdriver? ')
    driver.quit()
    if input('update ml algorithm with new information from this study session? y/n: ') == 'n':
        print('Exiting')
    else:
        understandability_algorithm.update(data_sentences)
def simple_weblang(query_origin="der alte", language='de', max_websites_to_scan=30,
                   target_sentences=3, understandability_target=.3, parse_limit=300):
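    """Non-interactive variant of weblang.

    Scans up to max_websites_to_scan search results, keeps sentences whose
    predicted understandability exceeds understandability_target (until
    target_sentences are collected), retrains the model, and returns the
    collected [sentence, url, score] entries.
    """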
    # Wrap the query in quotes so the search engine matches it exactly.
    query = '"' + query_origin + '"'
    print('ai training start, new session started')
    understandability_algorithm = Understandability('data_to_train.csv', debug=False)
    understandability_algorithm.train()
    print('ai training done')
    # print('Class testing:', understandability_algorithm.predict("vocabs are ontime and dazzling and fantastic."))
    response = search(query, pause=2, num=max_websites_to_scan, stop=max_websites_to_scan, lang=language)
    # response = search(query, tld='co.in', pause=2)
    # The translation API is currently broken; score raw sentences instead of translations.
    api_broken = True
    with open('storage.csv', 'w', encoding='utf8') as file:
        good_sentences = 0
        output_sentences = []
        data_sentences = []
        for website_number in range(1, max_websites_to_scan + 1):
            if good_sentences >= target_sentences:
                break
            print('\nwebsite number', website_number)
            sentences, url = parse_another_site(response, driver, file, query_origin)
            for i, sentence in enumerate(sentences):
                if i >= parse_limit:
                    print('breaking: sentence index', i, 'reached limit', parse_limit)
                    break
                if not api_broken:
                    converted_sentence, converted_sentence_pronounciation = translateEnglish(sentence)
                    # Prefer scoring the pronunciation when the translator provides one.
                    if converted_sentence_pronounciation is None:
                        score = understandability_algorithm.predict(converted_sentence)
                    else:
                        score = understandability_algorithm.predict(converted_sentence_pronounciation)
                    if score == 1:
                        output_sentences.append([sentence, converted_sentence, converted_sentence_pronounciation, url, score])
                        good_sentences += 1
                else:
                    score = understandability_algorithm.predict(sentence)
                    print(score)
                    if score > understandability_target:
                        output_sentences.append([sentence, url, score])
                        good_sentences += 1
    print('finished')
    try:
        with open('output.txt', 'w', encoding='utf8') as f:
            f.write('\n\n')
            f.write('\n'.join([str(i) for i in output_sentences]))
        # Open the first collected sentence's source page in the browser.
        checkout = output_sentences[0][1]
        checkout_sentence = output_sentences[0][0]
        driver.get(checkout)
        # import pyperclip
        # pyperclip.copy(checkout_sentence)
        # spam = pyperclip.paste()
        # print(spam)
        if not api_broken:
            converted_sentence, converted_sentence_pronounciation = translateEnglish(checkout_sentence)
            print('\n|original', checkout_sentence, '\n|translated', converted_sentence, '\n|pronounciation', converted_sentence_pronounciation, '\n|url', url, '\n|comprehension level', score)
            display_separated(converted_sentence_pronounciation, 'en')
    except IndexError:
        print('query busted: do not include underlines or special formatting; the query must appear in the website text exactly')
    driver.quit()
    understandability_algorithm.update(data_sentences)
    print('returning output')
    return output_sentences
# weblang('行き先')
print('\n\n\n', "\n".join([str(i) for i in simple_weblang(query_origin='lightningstorm', language='en', understandability_target=.3)]), sep='')
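# A minimal sketch of consuming the returned entries (assuming each entry is a
# [sentence, url, score] list, as produced by the automatic-mode branch above):
# for sentence, url, score in simple_weblang(query_origin='der alte', language='de'):
#     print(f'{score:.2f}  {sentence}  ({url})')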