Skip to content

Commit

Permalink
Little reconstruction of the pipeline of paraphrasing
Browse files Browse the repository at this point in the history
  • Loading branch information
BaiBlanc committed Jun 14, 2020
1 parent d528f4c commit 1b6a50e
Show file tree
Hide file tree
Showing 7 changed files with 57 additions and 48 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,14 @@
import json
import sys
import urllib
from urllib2 import urlopen
import argparse
from bs4 import BeautifulSoup


def get_url(url):
    """Extract the http://mappings.dbpedia.org/server/ontology/classes/<entity>
    page link from the given
    http://mappings.dbpedia.org/index.php/OntologyClass:<entity> page.

    Args:
        url: URL of the OntologyClass mappings-wiki page to scrape.

    Returns:
        The href of the first anchor tag carrying rel="nofollow" on the page.

    Raises:
        urllib.error.URLError: if the page cannot be fetched.
        IndexError: if no rel="nofollow" anchor is present on the page.
    """
    # Python 3 only: the legacy urllib2 fallback (bare except) was removed —
    # the result of the try/except was unconditionally overwritten anyway.
    page = urllib.request.urlopen(url)
    soup = BeautifulSoup(page, "html.parser")
    # find_all is the modern spelling of the deprecated findAll alias.
    link = soup.find_all('a', attrs={"rel": "nofollow"})[0]['href']
    return link
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import urllib
from urllib2 import urlopen
import json
import sys
import csv
Expand All @@ -18,10 +17,7 @@ class related information and data types as field values in each row.
- This function also returns a 2D list of the information mentioned above to the calling
function
"""
try: # python3
page = urllib.request.urlopen(url)
except: # python2
page = urlopen(url)
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page, "html.parser")
if(not os.path.isdir(project_name)):
os.makedirs(project_name)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from generate_url import generate_url_spec, generate_url
from get_properties import get_properties
import urllib
from urllib2 import urlopen
# from urllib2 import urlopen
import urllib.parse
from bs4 import BeautifulSoup
import os
Expand All @@ -29,10 +29,10 @@ def rank_check(query, diction, count, original_count):
# url = "https://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query="+query + \
# "&format=application%2Fsparql-results%2Bjson&CXML_redir_for_subjs=121&CXML_redir_for_hrefs=&timeout=30000&debug=on&run=+Run+Query+"
# print(url)
try: # python3
page = urllib.request.urlopen(url)
except: # python2
page = urlopen(url)
# try: # python3
# page = urllib.request.urlopen(url)
# except: # python2
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page, "html.parser")
total = len(soup.find_all("tr"))
accum = 0
Expand Down Expand Up @@ -66,7 +66,7 @@ def check_query(log, query):
try: # python3
page = urllib.request.urlopen(url)
except: # python2
page = urlopen(url)
page = urllib.request.urlopen(url)
soup = BeautifulSoup(page, "html.parser")
# print((soup.text))
if (soup.text == "false"):
Expand Down
15 changes: 12 additions & 3 deletions gsoc/zheyuan/pipeline/generate_templates.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,16 @@
import argparse
from paraphrase_questions import get_pretrained_model,prepare_model,set_seed
from get_properties import get_properties
from generate_url import generate_url
from sentence_and_template_generator import sentence_and_template_generator
import os
from fetch_ranks_sub import fetch_ranks
import logging
from constant import Constant

const = Constant()

const.URL = "https://datascience-models-ramsri.s3.amazonaws.com/t5_paraphraser.zip"

def generate_templates(label,project_name,depth=1,output_file="sentence_and_template_generator"):
"""
Expand Down Expand Up @@ -32,21 +38,24 @@ def generate_templates(label,project_name,depth=1,output_file="sentence_and_temp
logging.basicConfig(filename=project_name+"/logfile.log", format='%(filename)s: %(message)s', filemode='w')

# Setting threshold level
logger.setLevel(logging.DEBUG)
logger.setLevel(logging.WARNING)

# Use the logging methods
#logger.debug("This is a debug message")
logger.info("This is a log file.")
#logger.warning("This is a warning message")
#logger.error("This is an error message")
#logger.critical("This is a critical message")
#logger.critical("This is a critical message")
folder_path = get_pretrained_model(const.URL)
set_seed(42)
tokenizer, device, model = prepare_model(folder_path)

list_of_property_information = get_properties(url=url,project_name=project_name,output_file = "get_properties.csv")
for property_line in list_of_property_information:
count+=1
prop = property_line.split(',')
print("**************\n"+str(prop))
sentence_and_template_generator(expand_set=expand_set, original_count=depth,prop_dic=prop_dic,test_set=test_set,log=logger,diction=diction,output_file=output_file,mother_ontology=about.strip().replace("http://dbpedia.org/ontology/","dbo:"),vessel=vessel,project_name=project_name ,prop=prop, suffix = " of <A> ?",count = depth)
sentence_and_template_generator(original_count=depth,prop_dic=prop_dic,test_set=test_set,log=logger,diction=diction,output_file=output_file,mother_ontology=about.strip().replace("http://dbpedia.org/ontology/","dbo:"),vessel=vessel,project_name=project_name ,prop=prop, suffix = " of <A> ?",count = depth,expand_set=expand_set,tokenizer=tokenizer,device=device,model=model)
output_file.close()

if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion gsoc/zheyuan/pipeline/generate_url.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def generate_url(given_label):
#print("Sub-class of: "+val['rdfs:subClassOf']['@rdf:resource'])
pass
url = val['prov:wasDerivedFrom']['@rdf:resource']
#print("URL:" + url)
# print("URL:" + get_url(url))
if(given_label == val['@rdf:about'].split('http://dbpedia.org/ontology/')[-1]):
return [get_url(url),about]
return ["None","None"]
Expand Down
26 changes: 14 additions & 12 deletions gsoc/zheyuan/pipeline/paraphrase_questions.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,23 +30,23 @@ def get_pretrained_model(zip_file_url):
print('Finish {}'.format(model_name))
return folder_path

def prepare_model(folder_path):
    """Load the fine-tuned T5 paraphraser and its tokenizer.

    Args:
        folder_path: Directory holding the pretrained T5 model weights.

    Returns:
        Tuple of (tokenizer, device, model): the 't5-base' tokenizer, the
        selected torch device (CUDA when available, else CPU), and the
        model already moved onto that device.
    """
    t5_model = T5ForConditionalGeneration.from_pretrained(folder_path)
    t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')

    run_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("device ", run_device)
    return t5_tokenizer, run_device, t5_model.to(run_device)

def set_seed(seed):
    """Seed torch's RNGs for reproducible generation.

    Seeds the CPU generator always, and every CUDA device's generator
    when CUDA is available.

    Args:
        seed: Integer seed value.
    """
    torch.manual_seed(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


def paraphrase_questions(sentence):
def paraphrase_questions(tokenizer, device, model, sentence):
sentence = sentence.replace("<A>", "XYZ")
folder_path = get_pretrained_model(const.URL)
set_seed(42)

model = T5ForConditionalGeneration.from_pretrained(folder_path)
tokenizer = T5Tokenizer.from_pretrained('t5-base')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device ", device)
model = model.to(device)

text = "paraphrase: " + sentence + " </s>"

Expand Down Expand Up @@ -103,6 +103,8 @@ def paraphrase_questions(sentence):

# project_name = args.project_name
# depth = args.depth

print(paraphrase_questions(sentence))
folder_path = get_pretrained_model(const.URL)
set_seed(42)
tokenizer, device, model = prepare_model(folder_path)
print(paraphrase_questions(tokenizer,device,model,sentence))
pass
38 changes: 22 additions & 16 deletions gsoc/zheyuan/pipeline/sentence_and_template_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,10 @@ def rank_check(query,diction,count,original_count):
ques = ques+"?x"+str(value+1)+" "
query = query.replace("(?a)","(?a)"+ ques) + " order by RAND() limit 100"
#print(query)
query = urllib.parse.quote(query)
try: # python3
query = urllib.parse.quote_plus(query)
except: # python2
query = urllib.quote_plus(query)
url = "https://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query="+query+"&format=text%2Fhtml&CXML_redir_for_subjs=121&CXML_redir_for_hrefs=&timeout=30000&debug=on&run=+Run+Query+"
# url = "https://dbpedia.org/sparql?default-graph-uri=http%3A%2F%2Fdbpedia.org&query="+query + \
# "&format=application%2Fsparql-results%2Bjson&CXML_redir_for_subjs=121&CXML_redir_for_hrefs=&timeout=30000&debug=on&run=+Run+Query+"
Expand Down Expand Up @@ -70,7 +73,7 @@ def check_query(log,query):
log.error(query_original )


def sentence_and_template_generator(expand_set, prop_dic,test_set,log,mother_ontology,vessel,prop,project_name,output_file,diction,original_count=0,count=0, suffix = " of <A> ?", query_suffix = ""):
def sentence_and_template_generator(prop_dic,test_set,log,mother_ontology,vessel,prop,project_name,output_file,diction,expand_set,tokenizer,device,model,original_count=0,count=0, suffix = " of <A> ?", query_suffix = ""):

if(type(prop)==str):
prop = prop.split(',')
Expand Down Expand Up @@ -111,15 +114,7 @@ def sentence_and_template_generator(expand_set, prop_dic,test_set,log,mother_ont
original_sparql = query_starts_with[number]+"where { <A> "+ query_suffix + prop_link +" ?x "+ query_ends_with[number]
natural_language_question.append(original_question)
sparql_query.append(original_sparql)
if count == original_count:

# candidates = paraphrase_questions(original_question.replace("<A>", "XYZ"))
# @todo establish a cirteria to determine whether to expand the current template pair
# For instance, just randomly pick up the first one from the candidates to expand templates
# and restore it with the orignial SPARQL template
# expanded_nl_question.append(candidates[0][0])
# expanded_sparql_query.append(original_sparql)
pass

if(query_suffix==""):
query_answer = ("select distinct(?a) where { ?a "+prop_link+" [] } ")
Expand All @@ -144,20 +139,31 @@ def sentence_and_template_generator(expand_set, prop_dic,test_set,log,mother_ont
#for temp_counter in range(original_count):
if( not prop[0] in prop_dic[original_count-count-1]):
for number in range(len(natural_language_question)):
if count == original_count-1:
candidates = paraphrase_questions(tokenizer,device,model,original_question)
# @todo establish a cirteria to determine whether to expand the current template pair
# For instance, just randomly pick up the first one from the candidates to expand templates
# and store it with the orignial SPARQL template
expanded_nl_question.append(candidates[0][0])
expanded_sparql_query.append(original_sparql)
pass
if expanded_sparql_query:
expand_line = [mother_ontology,"","",expanded_nl_question[number],expanded_sparql_query[number],query_answer]
expand_set.write((';'.join(expand_line)+";"+str(rank)+"\n").replace(" "," "))
vessel.append([mother_ontology,"","",natural_language_question[number],sparql_query[number],query_answer])
output_file.write((';'.join(vessel[-1])+";"+str(rank)+"\n").replace(" "," "))
log.info(';'.join(vessel[-1])+str(rank)+"\n")

else:
for number in range(len(natural_language_question)):
if expanded_sparql_query:
expand_line = [mother_ontology,"","",expanded_sparql_query[number],expanded_sparql_query[number],query_answer]
expand_set.write((';'.join(expand_line)+";"+str(rank)+"\n").replace(" "," "))
else:
for number in range(len(natural_language_question)):
vessel.append([mother_ontology,"","",natural_language_question[number],sparql_query[number],query_answer])
test_set.write((';'.join(vessel[-1])+";"+str(rank)+"\n").replace(" "," "))
print("++++++++++++++++++++",vessel[-1],"+++++++++++++++")
log.info("Test: "+';'.join(vessel[-1])+str(rank)+"\n")
if expanded_sparql_query:
expand_line = [mother_ontology,"","",expanded_sparql_query[number],expanded_sparql_query[number],query_answer]
expand_set.write((';'.join(expand_line)+";"+str(rank)+"\n").replace(" "," "))

prop_dic[original_count-count-1].append(prop[0])
#print(str(natural_language_question)+"\n"+str(sparql_query)+"\n"+query_answer+"\n*************")

Expand All @@ -172,5 +178,5 @@ def sentence_and_template_generator(expand_set, prop_dic,test_set,log,mother_ont
list_of_property_information = get_properties(url=url,project_name=project_name,output_file =prop[1]+".csv" )
for property_line in tqdm(list_of_property_information):
prop_inside = property_line.split(',')
sentence_and_template_generator(expand_set=expand_set, prop_dic=prop_dic,test_set= test_set,log=log,original_count=original_count,diction=diction,output_file=output_file, mother_ontology=mother_ontology,vessel=vessel,prop=prop_inside, suffix = suffix,count = count, project_name=project_name, query_suffix = query_suffix )
sentence_and_template_generator(expand_set=expand_set, prop_dic=prop_dic,test_set= test_set,log=log,original_count=original_count,diction=diction,output_file=output_file, mother_ontology=mother_ontology,vessel=vessel,prop=prop_inside, suffix = suffix,count = count, project_name=project_name, query_suffix = query_suffix,tokenizer=tokenizer,device=device,model=model)

0 comments on commit 1b6a50e

Please sign in to comment.