Skip to content

Commit

Permalink
Little reconstruction of the pipeline of paraphrasing
Browse files Browse the repository at this point in the history
  • Loading branch information
BaiBlanc committed Jun 14, 2020
1 parent d528f4c commit 1b6a50e
Show file tree
Hide file tree
Showing 7 changed files with 57 additions and 48 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,14 @@
import json
import sys
import urllib
from urllib2 import urlopen
import argparse
from bs4 import BeautifulSoup


def get_url(url):
    """Extract the http://mappings.dbpedia.org/server/ontology/classes/<entity>
    page link from the given
    http://mappings.dbpedia.org/index.php/OntologyClass:<entity> page.

    Args:
        url: URL of the OntologyClass mappings-wiki page to scrape.

    Returns:
        The href of the first anchor tag carrying rel="nofollow" on the page.

    Raises:
        urllib.error.URLError: if the page cannot be fetched.
        IndexError: if no rel="nofollow" anchor is present on the page.
    """
    # Python 3 only: the legacy urllib2 fallback (bare except) was removed —
    # the result of the try/except was unconditionally overwritten anyway.
    page = urllib.request.urlopen(url)
    soup = BeautifulSoup(page, "html.parser")
    # find_all is the modern spelling of the deprecated findAll alias.
    link = soup.find_all('a', attrs={"rel": "nofollow"})[0]['href']
    return link
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import urllib
from urllib2 import urlopen
import json
import sys
import csv
Expand All @@ -18,10 +17,7 @@ class related information and data types as field values in each row.
- This function also returns a 2D list of the information mentioned above to the calling
function
"""
try: # python3
page = urllib.request.urlopen(url)
except: # python2
page = urlopen(url)
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page, "html.parser")
if(not os.path.isdir(project_name)):
os.makedirs(project_name)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from generate_url import generate_url_spec, generate_url
from get_properties import get_properties
import urllib
from urllib2 import urlopen
# from urllib2 import urlopen
import urllib.parse
from bs4 import BeautifulSoup
import os
Expand All @@ -29,10 +29,10 @@ def rank_check(query, diction, count, original_count):
# url = "https://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query="+query + \
# "&format=application%2Fsparql-results%2Bjson&CXML_redir_for_subjs=121&CXML_redir_for_hrefs=&timeout=30000&debug=on&run=+Run+Query+"
# print(url)
try: # python3
page = urllib.request.urlopen(url)
except: # python2
page = urlopen(url)
# try: # python3
# page = urllib.request.urlopen(url)
# except: # python2
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page, "html.parser")
total = len(soup.find_all("tr"))
accum = 0
Expand Down Expand Up @@ -66,7 +66,7 @@ def check_query(log, query):
try: # python3
page = urllib.request.urlopen(url)
except: # python2
page = urlopen(url)
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page, "html.parser")
# print((soup.text))
if (soup.text == "false"):
Expand Down
15 changes: 12 additions & 3 deletions gsoc/zheyuan/pipeline/generate_templates.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
import argparse
from paraphrase_questions import get_pretrained_model,prepare_model,set_seed
from get_properties import get_properties
from generate_url import generate_url
from sentence_and_template_generator import sentence_and_template_generator
import os
from fetch_ranks_sub import fetch_ranks
import logging
from constant import Constant

const = Constant()

const.URL = "https://datascience-models-ramsri.s3.amazonaws.com/t5_paraphraser.zip"

def generate_templates(label,project_name,depth=1,output_file="sentence_and_template_generator"):
"""
Expand Down Expand Up @@ -32,21 +38,24 @@ def generate_templates(label,project_name,depth=1,output_file="sentence_and_temp
logging.basicConfig(filename=project_name+"/logfile.log", format='%(filename)s: %(message)s', filemode='w')

# Setting threshold level
logger.setLevel(logging.DEBUG)
logger.setLevel(logging.WARNING)

# Use the logging methods
#logger.debug("This is a debug message")
logger.info("This is a log file.")
#logger.warning("This is a warning message")
#logger.error("This is an error message")
#logger.critical("This is a critical message")
#logger.critical("This is a critical message")
folder_path = get_pretrained_model(const.URL)
set_seed(42)
tokenizer, device, model = prepare_model(folder_path)

list_of_property_information = get_properties(url=url,project_name=project_name,output_file = "get_properties.csv")
for property_line in list_of_property_information:
count+=1
prop = property_line.split(',')
print("**************\n"+str(prop))
sentence_and_template_generator(expand_set=expand_set, original_count=depth,prop_dic=prop_dic,test_set=test_set,log=logger,diction=diction,output_file=output_file,mother_ontology=about.strip().replace("http://dbpedia.org/ontology/","dbo:"),vessel=vessel,project_name=project_name ,prop=prop, suffix = " of <A> ?",count = depth)
sentence_and_template_generator(original_count=depth,prop_dic=prop_dic,test_set=test_set,log=logger,diction=diction,output_file=output_file,mother_ontology=about.strip().replace("http://dbpedia.org/ontology/","dbo:"),vessel=vessel,project_name=project_name ,prop=prop, suffix = " of <A> ?",count = depth,expand_set=expand_set,tokenizer=tokenizer,device=device,model=model)
output_file.close()

if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion gsoc/zheyuan/pipeline/generate_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def generate_url(given_label):
#print("Sub-class of: "+val['rdfs:subClassOf']['@rdf:resource'])
pass
url = val['prov:wasDerivedFrom']['@rdf:resource']
#print("URL:" + url)
# print("URL:" + get_url(url))
if(given_label == val['@rdf:about'].split('http://dbpedia.org/ontology/')[-1]):
return [get_url(url),about]
return ["None","None"]
Expand Down
26 changes: 14 additions & 12 deletions gsoc/zheyuan/pipeline/paraphrase_questions.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,23 +30,23 @@ def get_pretrained_model(zip_file_url):
print('Finish {}'.format(model_name))
return folder_path

def prepare_model(folder_path):
    """Load the fine-tuned T5 paraphraser and its tokenizer.

    Args:
        folder_path: Directory holding the pretrained T5 model weights.

    Returns:
        Tuple of (tokenizer, device, model): the 't5-base' tokenizer, the
        selected torch device (CUDA when available, else CPU), and the
        model already moved onto that device.
    """
    t5_model = T5ForConditionalGeneration.from_pretrained(folder_path)
    t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')

    run_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device ", run_device)
    return t5_tokenizer, run_device, t5_model.to(run_device)

def set_seed(seed):
    """Seed torch's RNGs for reproducible generation.

    Seeds the CPU generator always, and every CUDA device's generator
    when CUDA is available.

    Args:
        seed: Integer seed value.
    """
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


def paraphrase_questions(sentence):
def paraphrase_questions(tokenizer, device, model, sentence):
sentence = sentence.replace("<A>", "XYZ")
folder_path = get_pretrained_model(const.URL)
set_seed(42)

model = T5ForConditionalGeneration.from_pretrained(folder_path)
tokenizer = T5Tokenizer.from_pretrained('t5-base')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device ", device)
model = model.to(device)

text = "paraphrase: " + sentence + " </s>"

Expand Down Expand Up @@ -103,6 +103,8 @@ def paraphrase_questions(sentence):

# project_name = args.project_name
# depth = args.depth

print(paraphrase_questions(sentence))
folder_path = get_pretrained_model(const.URL)
set_seed(42)
tokenizer, device, model = prepare_model(folder_path)
print(paraphrase_questions(tokenizer,device,model,sentence))
pass
38 changes: 22 additions & 16 deletions gsoc/zheyuan/pipeline/sentence_and_template_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,10 @@ def rank_check(query,diction,count,original_count):
ques = ques+"?x"+str(value+1)+" "
query = query.replace("(?a)","(?a)"+ ques) + " order by RAND() limit 100"
#print(query)
query = urllib.parse.quote(query)
try: # python3
query = urllib.parse.quote_plus(query)
except: # python2
query = urllib.quote_plus(query)
url = "https://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query="+query+"&format=text%2Fhtml&CXML_redir_for_subjs=121&CXML_redir_for_hrefs=&timeout=30000&debug=on&run=+Run+Query+"
# url = "https://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query="+query + \
# "&format=application%2Fsparql-results%2Bjson&CXML_redir_for_subjs=121&CXML_redir_for_hrefs=&timeout=30000&debug=on&run=+Run+Query+"
Expand Down Expand Up @@ -70,7 +73,7 @@ def check_query(log,query):
log.error(query_original )


def sentence_and_template_generator(expand_set, prop_dic,test_set,log,mother_ontology,vessel,prop,project_name,output_file,diction,original_count=0,count=0, suffix = " of <A> ?", query_suffix = ""):
def sentence_and_template_generator(prop_dic,test_set,log,mother_ontology,vessel,prop,project_name,output_file,diction,expand_set,tokenizer,device,model,original_count=0,count=0, suffix = " of <A> ?", query_suffix = ""):

if(type(prop)==str):
prop = prop.split(',')
Expand Down Expand Up @@ -111,15 +114,7 @@ def sentence_and_template_generator(expand_set, prop_dic,test_set,log,mother_ont
original_sparql = query_starts_with[number]+"where { <A> "+ query_suffix + prop_link +" ?x "+ query_ends_with[number]
natural_language_question.append(original_question)
sparql_query.append(original_sparql)
if count == original_count:

# candidates = paraphrase_questions(original_question.replace("<A>", "XYZ"))
# @todo establish a cirteria to determine whether to expand the current template pair
# For instance, just randomly pick up the first one from the candidates to expand templates
# and restore it with the orignial SPARQL template
# expanded_nl_question.append(candidates[0][0])
# expanded_sparql_query.append(original_sparql)
pass

if(query_suffix==""):
query_answer = ("select distinct(?a) where { ?a "+prop_link+" [] } ")
Expand All @@ -144,20 +139,31 @@ def sentence_and_template_generator(expand_set, prop_dic,test_set,log,mother_ont
#for temp_counter in range(original_count):
if( not prop[0] in prop_dic[original_count-count-1]):
for number in range(len(natural_language_question)):
if count == original_count-1:
candidates = paraphrase_questions(tokenizer,device,model,original_question)
# @todo establish a cirteria to determine whether to expand the current template pair
# For instance, just randomly pick up the first one from the candidates to expand templates
# and store it with the orignial SPARQL template
expanded_nl_question.append(candidates[0][0])
expanded_sparql_query.append(original_sparql)
pass
if expanded_sparql_query:
expand_line = [mother_ontology,"","",expanded_nl_question[number],expanded_sparql_query[number],query_answer]
expand_set.write((';'.join(expand_line)+";"+str(rank)+"\n").replace(" "," "))
vessel.append([mother_ontology,"","",natural_language_question[number],sparql_query[number],query_answer])
output_file.write((';'.join(vessel[-1])+";"+str(rank)+"\n").replace(" "," "))
log.info(';'.join(vessel[-1])+str(rank)+"\n")

else:
for number in range(len(natural_language_question)):
if expanded_sparql_query:
expand_line = [mother_ontology,"","",expanded_sparql_query[number],expanded_sparql_query[number],query_answer]
expand_set.write((';'.join(expand_line)+";"+str(rank)+"\n").replace(" "," "))
else:
for number in range(len(natural_language_question)):
vessel.append([mother_ontology,"","",natural_language_question[number],sparql_query[number],query_answer])
test_set.write((';'.join(vessel[-1])+";"+str(rank)+"\n").replace(" "," "))
print("++++++++++++++++++++",vessel[-1],"+++++++++++++++")
log.info("Test: "+';'.join(vessel[-1])+str(rank)+"\n")
if expanded_sparql_query:
expand_line = [mother_ontology,"","",expanded_sparql_query[number],expanded_sparql_query[number],query_answer]
expand_set.write((';'.join(expand_line)+";"+str(rank)+"\n").replace(" "," "))

prop_dic[original_count-count-1].append(prop[0])
#print(str(natural_language_question)+"\n"+str(sparql_query)+"\n"+query_answer+"\n*************")

Expand All @@ -172,5 +178,5 @@ def sentence_and_template_generator(expand_set, prop_dic,test_set,log,mother_ont
list_of_property_information = get_properties(url=url,project_name=project_name,output_file =prop[1]+".csv" )
for property_line in tqdm(list_of_property_information):
prop_inside = property_line.split(',')
sentence_and_template_generator(expand_set=expand_set, prop_dic=prop_dic,test_set= test_set,log=log,original_count=original_count,diction=diction,output_file=output_file, mother_ontology=mother_ontology,vessel=vessel,prop=prop_inside, suffix = suffix,count = count, project_name=project_name, query_suffix = query_suffix )
sentence_and_template_generator(expand_set=expand_set, prop_dic=prop_dic,test_set= test_set,log=log,original_count=original_count,diction=diction,output_file=output_file, mother_ontology=mother_ontology,vessel=vessel,prop=prop_inside, suffix = suffix,count = count, project_name=project_name, query_suffix = query_suffix,tokenizer=tokenizer,device=device,model=model)

0 comments on commit 1b6a50e

Please sign in to comment.