From 60873ca5030526cc4c320e881b127eca281623e7 Mon Sep 17 00:00:00 2001 From: LLTommy Date: Wed, 27 Sep 2017 16:03:20 +0100 Subject: [PATCH 01/66] First try to fix https issue --- oxo-web/src/main/resources/templates/about.html | 6 +++--- oxo-web/src/main/resources/templates/contact.html | 4 ++-- oxo-web/src/main/resources/templates/datasource.html | 6 +++--- oxo-web/src/main/resources/templates/docs.html | 8 ++++---- oxo-web/src/main/resources/templates/index.html | 8 ++++---- oxo-web/src/main/resources/templates/login.html | 8 ++++---- oxo-web/src/main/resources/templates/mapping.html | 6 +++--- oxo-web/src/main/resources/templates/mappings.html | 8 ++++---- oxo-web/src/main/resources/templates/myaccount.html | 8 ++++---- oxo-web/src/main/resources/templates/search.html | 2 +- oxo-web/src/main/resources/templates/terms.html | 8 ++++---- 11 files changed, 36 insertions(+), 36 deletions(-) diff --git a/oxo-web/src/main/resources/templates/about.html b/oxo-web/src/main/resources/templates/about.html index 17c0da9..ba042f7 100644 --- a/oxo-web/src/main/resources/templates/about.html +++ b/oxo-web/src/main/resources/templates/about.html @@ -4,7 +4,7 @@ - + @@ -34,8 +34,8 @@ - - + + diff --git a/oxo-web/src/main/resources/templates/contact.html b/oxo-web/src/main/resources/templates/contact.html index 502ed80..34af943 100644 --- a/oxo-web/src/main/resources/templates/contact.html +++ b/oxo-web/src/main/resources/templates/contact.html @@ -34,8 +34,8 @@ - - + + diff --git a/oxo-web/src/main/resources/templates/datasource.html b/oxo-web/src/main/resources/templates/datasource.html index 8d621d3..3453993 100644 --- a/oxo-web/src/main/resources/templates/datasource.html +++ b/oxo-web/src/main/resources/templates/datasource.html @@ -21,7 +21,7 @@ - + @@ -34,8 +34,8 @@ - - + + diff --git a/oxo-web/src/main/resources/templates/docs.html b/oxo-web/src/main/resources/templates/docs.html index 406dbbf..6835ebf 100644 --- a/oxo-web/src/main/resources/templates/docs.html +++ b/oxo-web/src/main/resources/templates/docs.html @@ -4,7 +4,7 @@ - + @@ -21,7 +21,7 @@ - + @@ -34,8 +34,8 @@ - - + + diff --git a/oxo-web/src/main/resources/templates/index.html b/oxo-web/src/main/resources/templates/index.html index bd4dc76..5cbac3c 100644 --- a/oxo-web/src/main/resources/templates/index.html +++ b/oxo-web/src/main/resources/templates/index.html @@ -4,7 +4,7 @@ - + @@ -21,7 +21,7 @@ - + @@ -34,8 +34,8 @@ - - + + diff --git a/oxo-web/src/main/resources/templates/login.html b/oxo-web/src/main/resources/templates/login.html index e1ece60..9271b1b 100644 --- a/oxo-web/src/main/resources/templates/login.html +++ b/oxo-web/src/main/resources/templates/login.html @@ -4,7 +4,7 @@ - + @@ -21,7 +21,7 @@ - + @@ -34,8 +34,8 @@ - - + + diff --git a/oxo-web/src/main/resources/templates/mapping.html b/oxo-web/src/main/resources/templates/mapping.html index ee34a54..b4c6964 100644 --- a/oxo-web/src/main/resources/templates/mapping.html +++ b/oxo-web/src/main/resources/templates/mapping.html @@ -21,7 +21,7 @@ - + @@ -35,8 +35,8 @@ - - + + diff --git a/oxo-web/src/main/resources/templates/mappings.html b/oxo-web/src/main/resources/templates/mappings.html index 91b97c8..798ed55 100644 --- a/oxo-web/src/main/resources/templates/mappings.html +++ b/oxo-web/src/main/resources/templates/mappings.html @@ -4,7 +4,7 @@ - + @@ -21,7 +21,7 @@ - + @@ -35,8 +35,8 @@ - - + + diff --git a/oxo-web/src/main/resources/templates/myaccount.html b/oxo-web/src/main/resources/templates/myaccount.html index d871de6..d2599a5 100644 --- 
a/oxo-web/src/main/resources/templates/myaccount.html +++ b/oxo-web/src/main/resources/templates/myaccount.html @@ -4,7 +4,7 @@ - + @@ -21,7 +21,7 @@ - + @@ -34,8 +34,8 @@ - - + + diff --git a/oxo-web/src/main/resources/templates/search.html b/oxo-web/src/main/resources/templates/search.html index 3c21bc1..5c2710c 100644 --- a/oxo-web/src/main/resources/templates/search.html +++ b/oxo-web/src/main/resources/templates/search.html @@ -21,7 +21,7 @@ - + diff --git a/oxo-web/src/main/resources/templates/terms.html b/oxo-web/src/main/resources/templates/terms.html index b7cd824..1bbcb8a 100644 --- a/oxo-web/src/main/resources/templates/terms.html +++ b/oxo-web/src/main/resources/templates/terms.html @@ -4,7 +4,7 @@ - + @@ -21,7 +21,7 @@ - + @@ -35,8 +35,8 @@ - - + + From 23787e3c4de81643e4f5342f3012399392c8e2f3 Mon Sep 17 00:00:00 2001 From: LLTommy Date: Tue, 21 Nov 2017 15:30:15 +0000 Subject: [PATCH 02/66] Adding paxo to dev oxo --- paxo/clientOperations.py | 300 ++++++++++++++++++++++++++++++++ paxo/flaskMapping.py | 360 +++++++++++++++++++++++++++++++++++++++ paxo/readme.txt | 27 +++ paxo/requirements.txt | 3 + paxo/validation.py | 165 ++++++++++++++++++ 5 files changed, 855 insertions(+) create mode 100644 paxo/clientOperations.py create mode 100644 paxo/flaskMapping.py create mode 100644 paxo/readme.txt create mode 100644 paxo/requirements.txt create mode 100644 paxo/validation.py diff --git a/paxo/clientOperations.py b/paxo/clientOperations.py new file mode 100644 index 0000000..10a1674 --- /dev/null +++ b/paxo/clientOperations.py @@ -0,0 +1,300 @@ +import flaskMapping +import logging +import validation +import csv +import time +import requests +import json +from ConfigParser import SafeConfigParser +import ast + +url="https://www.ebi.ac.uk/ols/api/" + +#Compares two ontologies from the OLS.
This process can take a while and produces a csv with primary results +def scoreOntologies(sourceOntology, targetOntology): + logging.info("Start scoring "+sourceOntology+" and "+targetOntology) + #Check for the smaller ontology + r = requests.get(url+"ontologies/"+sourceOntology) + numberOfTerms=r.json()['numberOfTerms'] + r = requests.get(url+"ontologies/"+targetOntology) + numberOfTerms2 = r.json()['numberOfTerms'] + + #In case the targetOntology is smaller than the source Ontology, swap the two + if (numberOfTerms>numberOfTerms2): + tmpOntology=sourceOntology + sourceOntology=targetOntology + targetOntology=tmpOntology + + termsUrl=url+"ontologies/"+sourceOntology+"/terms?size=500&fieldList=iri,label,synonym" + results=[] + + results.append(["sourceLabel","sourceIRI", "fuzzy", "oxo", "synFuzzy", "synOxo", "bridgeTerms"]) + counter=0 + while True: + try: + r = requests.get(termsUrl) + except Exception as e: + print "Error with webservice call" + print e + logging.info("Error with webservice call") + logging.info(r.url) + logging.info(r.status_code) + raise e + + for term in r.json()['_embedded']['terms']: + originalLabel=term["label"] + synonyms=term["synonyms"] + + #Check if the term is actually defined in that ontology + if term['is_defining_ontology'] is True: + try: + pscore=flaskMapping.scoreTermOLS(term["iri"], originalLabel, targetOntology, {}) + calculatedMappings=flaskMapping.processPScore(pscore) + except Exception as e: + print "Exception in primary Scoring" + print e + print term["iri"] + print originalLabel + print targetOntology + logging.info("Exception in primary Scoring") + logging.info(term["iri"]+" "+originalLabel) + calculatedMappings={'sourceTerm':term["iri"]+"ERROR", "olsFuzzyScore": [], "oxoScore": [], "bridgeEvidence": []} + + #If synonyms are available, run through the same steps with synonyms to score an ontology + synCalculatedMappings={} + if synonyms!=None: + for synonym in synonyms: + try: + synPscore=flaskMapping.primaryScoreTerm('', synonym, targetOntology) + synCalculatedMappings=flaskMapping.processPScore(synPscore) #Process the primaryScore for synonyms + synCalculatedMappings['sourceIRI']=term["iri"] + except Exception as e: + print "Exception in Synonym processPScore Term" + print e + synCalculatedMappings={'sourceTerm':term["iri"]+"ERROR", "olsFuzzyScore": [], "oxoScore": [], "bridgeEvidence": []} + logging.info("Exception in Synonym processPScore Term") + logging.info(term["iri"]+" "+synonym+" "+targetOntology) + synCalculatedMappings['olsFuzzyScore']=[{'fuzzyScore': 0, 'fuzzyMapping': 'UNKNOWN - ERROR', 'fuzzyIri': 'UNKNOWN - ERROR'}] + synCalculatedMappings['oxoScore']=[{'distance': 0, 'oxoCurie': 'UNKNOWN', 'oxoScore': 0}] + synCalculatedMappings['sourceIRI']=term["iri"] + + else: + synCalculatedMappings['olsFuzzyScore']=[{'fuzzyScore': 0, 'fuzzyMapping': 'UNKNOWN', 'fuzzyIri': 'UNKNOWN'}] + synCalculatedMappings['oxoScore']=[{'distance': 0, 'oxoCurie': 'UNKNOWN', 'oxoScore': 0}] + + results.append([originalLabel.encode(encoding='UTF-8'), term["iri"].encode(encoding='UTF-8'), calculatedMappings['olsFuzzyScore'], calculatedMappings['oxoScore'], synCalculatedMappings['olsFuzzyScore'], synCalculatedMappings['oxoScore'], calculatedMappings['bridgeEvidence']]) + + + try: + termsUrl=r.json()['_links']['next']['href'] + ###This is just temporary to not process all the stuff but abort after two pages + counter=counter+1 + if counter%2==0: + print "Processed "+str(counter)+" pages" + logging.info("Processed "+str(counter)+" pages") + #break # Not
necessary if we want to parse whole ontology, just activate this for testing + except: + logging.info("Reached last page I reckon") + print "Reached last page I reckon" + break + + with open('pipeline_output/scoring_output_'+sourceOntology+'_'+targetOntology+'.csv', 'w') as f: + writer = csv.writer(f) + writer.writerows(results) + f.close() + +#Read in and process the ontology primary score from a csv file +def scoreOntologyPrimaryScore(name): + with open("pipeline_output/"+name+"/scoring_output_"+name+".csv") as csvfile: + readCSV = csv.reader(csvfile, delimiter=',') + + scoreMatrix=[] + next(readCSV) #Skip csv header + for row in readCSV: + originalLabel=row[0] + orginaliri=row[1] + tmp=row[2] + fuzzy=ast.literal_eval(tmp) + tmp=row[3] + oxo=ast.literal_eval(tmp) + tmp=row[4] + synFuzzy=ast.literal_eval(tmp) + tmp=row[5] + synOxo=ast.literal_eval(tmp) + + for i in fuzzy: + obj={"sourceTerm":originalLabel, "sourceIRI":orginaliri ,"iri": i['fuzzyIri'], "olsFuzzyScore": fuzzy, "oxoScore": oxo, "synFuzzy": synFuzzy, "synOxo":synOxo} + scoreMatrix.append(obj) + + for i in oxo: + obj={"sourceTerm":originalLabel, "sourceIRI":orginaliri, "iri": i['oxoCurie'], "olsFuzzyScore": fuzzy, "oxoScore": oxo, "synFuzzy": synFuzzy, "synOxo":synOxo} + scoreMatrix.append(obj) + + for i in synFuzzy: + obj={"sourceTerm":originalLabel, "sourceIRI":orginaliri, "iri": i['fuzzyIri'], "olsFuzzyScore": fuzzy, "oxoScore": oxo, "synFuzzy": synFuzzy, "synOxo":synOxo} + scoreMatrix.append(obj) + + for i in synOxo: + obj={"sourceTerm":originalLabel, "sourceIRI":orginaliri, "iri": i['oxoCurie'], "olsFuzzyScore": fuzzy, "oxoScore": oxo, "synFuzzy": synFuzzy, "synOxo":synOxo} + scoreMatrix.append(obj) + + + simplerMatrix=[] + #Calls simplifyProcessedPscore for every line in scoreMatrix that we just read in + for line in scoreMatrix: + pScore=flaskMapping.simplifyProcessedPscore(line) + if pScore not in simplerMatrix: + simplerMatrix.append(pScore) + + return simplerMatrix +
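+
+# Illustration only (hypothetical values, not part of the pipeline): every score cell
+# that scoreOntologies writes to the csv is a Python literal, which is why
+# scoreOntologyPrimaryScore above can read it back with ast.literal_eval.
+def exampleParseScoringCell():
+    cell="[{'fuzzyScore': 0.91667, 'fuzzyMapping': 'stroke disorder', 'fuzzyIri': 'http://example.org/X_1'}]"
+    fuzzy=ast.literal_eval(cell)
+    print fuzzy[0]['fuzzyIri'] #prints http://example.org/X_1
+    return fuzzy
+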
+#Takes simplified input and actually calculates the final score +def processOntologyPrimaryScore(pScore, params): + result=[] + for line in pScore: + singleLineResult=flaskMapping.scoreSimple(line, params) + result.append(singleLineResult) + + #Take the highest scored mappings + tmp=[] + for entry in result: + if entry!=[]: + tmp.append(entry) + #else: + # print "entry in results!" + + #SortScore + tmp=sorted(tmp, key=lambda tmp:tmp[0]['finaleScore'], reverse=True) + return tmp + +#Maybe transfer to server +def scoreTermList(termList, targetOntology, params): + result=[] + for term in termList: + result.append(flaskMapping.scoreTermLabel(term, targetOntology, params)) + return result + +# Process an IRI list via OLS instead of a termList +# def scoreIriList(IriList, targetOntology, params): + +#Process scoredMatrix to prepare for validation or save to disc +def writeOutPutScore(scoredMatrix, name, saveToDisc): + result=[] + for line in scoredMatrix: + result.append([line[0]['sourceIRI'], line[0]['iri'], line[0]['finaleScore'], line[0]['sourceTerm']]) + + if saveToDisc==True: + with open('pipeline_output/calculated_output_'+name+'.csv', 'w') as f: + writer = csv.writer(f) + writer.writerows(result) + f.close() + + return result + +#Remove double entry stuff #Not fully implemented yet +def curationOntologyFinalScore(scoredMatrix): + endmap=[] + unified=[] + #print scoredMatrix + for counter, line in enumerate(scoredMatrix): + print line + + if line[1] not in endmap: + endmap.append(line[1]) + unified.append(line) + else: + print "Double entry Found!!! Will replace now! " + index=endmap.index(line[1]) + if unified[index][2]0: + for row in jsonReply['mappingResponseList']: + + ##Additional webservice call to get the stupid long IRI out of oxo + oxoMapURL="https://www.ebi.ac.uk/spot/oxo/api/mappings" + data={"fromId":row['curie']} + longId=apiCall(oxoMapURL, data) + longId=longId.json()['_embedded']['mappings'][0]['fromTerm']['uri'] ## + tmpList.append({"curie":longId, "distance":row['distance']}) + #tmpList.append({"curie":row['curie'], "distance":row['distance']}) + sortedCurie=sorted(tmpList, key=lambda tmpList: tmpList['distance'], reverse=False) + else: + sortedCurie=[{"curie":"UNKNOWN", "distance": 0}] + return sortedCurie + except Exception as e: + print "Problem with oxo:" + print e + print "Termlabel: "+termLabel + print "TargetOntology: "+targetOntology + return [{"curie":"UNKNOWN", "distance": 0}] + + +def stringProcess(term): + processedTerm=term.lower() #Make string lower case + processedTerm=processedTerm.replace(" ", "") #Remove all spaces + + #Simply cut some things from the label before calculating the Levenshtein distance + for cut in cutList: + tmpArray=term.split(cut) #Remove problematic terms + if len(tmpArray[0])!=0: + processedTerm=tmpArray[0] + break + elif len(tmpArray[1])!=0: + processedTerm=tmpArray[1] + break + else: + print "Something is wrong" + break + + return processedTerm + +#Takes an input label and executes the fuzzyOLS call +def olsFuzzyMatch(termLabel, targetOntology): + data={"q":termLabel, "ontology":targetOntology, "type":"class", "local":True} + jsonReply=apiCall(searchURL, data) + termLabel=termLabel.encode(encoding='UTF-8') + + termLabel=stringProcess(termLabel) + + #We found at least 1 hit + jsonReply=jsonReply.json()['response'] + if jsonReply['numFound']>0: + levList=[] + for reply in jsonReply['docs']: + try: + answerTerm=stringProcess(reply['label'].encode(encoding='UTF-8')) + lev=round(Levenshtein.ratio(termLabel, answerTerm), 5) + levList.append({"SourceLabel": termLabel, "SourceIRI": termLabel , "TargetIRI": reply['iri'], "TargetLabel": reply['label'], "lev":lev}) + except: + print "ERROR WITH LEV Distance, score 0 for now for these two" + print reply + print termLabel + levList.append({"SourceLabel": termLabel, "SourceIRI": termLabel , "TargetIRI": reply['iri'], "TargetLabel": reply['label']+"_ERROR", "lev":0}) + 
sortedLev=sorted(levList, key=lambda levList:levList['lev'], reverse=True) + + else: + #print "No hits, therefore Add empty placeholder" + sortedLev=[{"SourceLabel": termLabel, "SourceIRI": termLabel , "TargetIRI": "UNKNOWN", "TargetLabel": "UNKNOWN", "lev": 0}] + + + ##Now let's relax the fuzzy search and aim for other (all) ontologies + data={"q":termLabel, "type":"class", "local":True, "limit":30} + jsonReply=apiCall(searchURL, data) + jsonReply=jsonReply.json()['response'] + oxoTargetList=[] + if jsonReply['numFound']>0: + for reply in jsonReply['docs']: + if reply['ontology_name']!=targetOntology: + oxoTargetList.append({"short_form": reply['short_form'],"bridgeOntology":reply['ontology_name']}) + + return {"fuzzyTerms": sortedLev, "bridgeTerms": oxoTargetList} + +#Executes the basic calls, delivers primary score (raw scoring) +def primaryScoreTerm(termIRI, termLabel, targetOntology): + olsFuzzyResult=olsFuzzyMatch(termLabel, targetOntology) + + if termIRI!='': + oxoResults=oxoMatch(termIRI, targetOntology) + else: + oxoResults=[{"curie":"UNKNOWN", "distance": 0}] + + bridgeTerms=olsFuzzyResult['bridgeTerms'] + olsFuzzyResult=olsFuzzyResult['fuzzyTerms'] + + logging.info("On the search for this fk bridge Terms") + logging.info(bridgeTerms) + bridgeOxo=[] + if len(bridgeTerms)>0: + for bridgeTerm in bridgeTerms: + tmp=oxoMatch(bridgeTerm['short_form'],targetOntology) + + for line in tmp: + if line['curie']!='UNKNOWN': + #print "FOUND a bridge OxO thing! Finally!" + #logging.info("FOUND a bridge OxO thing! Finally!") + bridgeOxo.append(tmp) + else: + bridgeOxo=[[{"curie":"UNKNOWN", "distance": 0}]] + else: + bridgeOxo=[[{"curie":"UNKNOWN", "distance": 0}]] + + + try: + bridgeOxo=bridgeOxo[0] + except Exception as e: + print "Error with that stupid list in bridgeOxo" + print termIRI + print termLabel + print bridgeOxo + bridgeOxo=[[{"curie":"UNKNOWN", "distance": 0}]] + + scoreTerm={"sourceTerm": termLabel, "targetOntology":targetOntology,"olsFuzzyScore":olsFuzzyResult, "oxoScore":oxoResults, "bridgeEvidence":bridgeOxo} + return scoreTerm + +#The function takes a primary score (raw) and calculates a corresponding score for each subresult +def processPScore(pScore): + mapping={'sourceTerm':pScore['sourceTerm'], "olsFuzzyScore": [], "oxoScore": [], "bridgeEvidence": []} + + + for fuzzy in pScore['olsFuzzyScore']: + mapping['olsFuzzyScore'].append({'fuzzyMapping': fuzzy['TargetLabel'], 'fuzzyIri':fuzzy['TargetIRI'],'fuzzyScore': fuzzy['lev']}) + + for oxo in pScore['oxoScore']: + tmpCurie=oxo['curie'] + oxoScore=int(oxo['distance']) + + if int(oxo['distance'])==0: + tmpCurie="UNKNOWN" + + mapping['oxoScore'].append({'oxoCurie':tmpCurie, "distance": oxo['distance'] ,"oxoScore":oxoScore}) + + #Is this the right place to do it? Unsure at the moment + for oxo in pScore['bridgeEvidence']: + tmpCurie=oxo['curie'] + oxoScore=int(oxo['distance']) + + if int(oxo['distance'])==0: + tmpCurie="UNKNOWN" + + mapping['bridgeEvidence'].append({'oxoCurie':tmpCurie, "distance": oxo['distance'], "oxoScore":oxoScore}) + + return mapping +
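+
+# Shape sketch (hypothetical values, documentation only): given a primary score such as
+# {"sourceTerm": "stroke", "targetOntology": "doid",
+#  "olsFuzzyScore": [{"SourceLabel": "stroke", "SourceIRI": "stroke", "TargetIRI": "http://example.org/X_1", "TargetLabel": "stroke disorder", "lev": 0.8}],
+#  "oxoScore": [{"curie": "http://example.org/X_1", "distance": 1}],
+#  "bridgeEvidence": [{"curie": "UNKNOWN", "distance": 0}]}
+# processPScore would return
+# {"sourceTerm": "stroke",
+#  "olsFuzzyScore": [{"fuzzyMapping": "stroke disorder", "fuzzyIri": "http://example.org/X_1", "fuzzyScore": 0.8}],
+#  "oxoScore": [{"oxoCurie": "http://example.org/X_1", "distance": 1, "oxoScore": 1}],
+#  "bridgeEvidence": [{"oxoCurie": "UNKNOWN", "distance": 0, "oxoScore": 0}]}
+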
+#Combines the subresults of the processPScore function. Results are combined line by line #So we combine the same results and add a new line for new entries +def simplifyProcessedPscore(mapping): + scoreMatrix=[] + sourceIRI=mapping['sourceIRI'] + + flag=False + + for line in mapping['olsFuzzyScore']: + if line['fuzzyScore']==[]: + line['fuzzyScore']=0 + obj={"sourceTerm":mapping['sourceTerm'], "sourceIRI":sourceIRI,"iri":line['fuzzyIri'], "fuzzyScore": line['fuzzyScore'], "oxoScore": 0, "synFuzzy":0, "synOxo": 0} + scoreMatrix.append(obj) + + flag=False + for line in mapping['oxoScore']: + for s in scoreMatrix: + if line["oxoCurie"]==s["iri"]: + s['oxoScore']=line['oxoScore'] + flag=True + + if flag==False: + obj={"sourceTerm":mapping['sourceTerm'], "sourceIRI":sourceIRI, "iri":line['oxoCurie'], "fuzzyScore": 0, "oxoScore": line['oxoScore'], "synFuzzy":0, "synOxo": 0} + scoreMatrix.append(obj) + + # Starting here we try to take care of synonyms! +# if mapping['synFuzzy']!=None and mapping['synOxo']!=None: + if 'synFuzzy' in mapping and 'synOxo' in mapping: + # Fuzzy Synonyms Score + flag=False + for line in mapping['synFuzzy']: + for s in scoreMatrix: + if line["fuzzyIri"]==s["iri"]: + #s['fuzzyScore']=line['fuzzyScore'] + s['synFuzzy']=line['fuzzyScore'] + flag=True + + if flag==False: + obj={"sourceTerm":mapping['sourceTerm'], "sourceIRI":sourceIRI,"iri":line['fuzzyIri'], "fuzzyScore":0, "oxoScore": 0, "synFuzzy": line['fuzzyScore'], "synOxo":0} + scoreMatrix.append(obj) + + # Oxo Synonyms Score + flag=False + for line in mapping['synOxo']: + for s in scoreMatrix: + if line["oxoCurie"]==s["iri"]: + #s['oxoScore']=line['oxoScore'] + s['synOxo']=line['oxoScore'] + flag=True + + if flag==False: + obj={"sourceTerm":mapping['sourceTerm'], "sourceIRI":sourceIRI, "iri":line['oxoCurie'], "fuzzyScore": 0, "oxoScore": 0, "synFuzzy":0, "synOxo": line['oxoScore']} + scoreMatrix.append(obj) + + else: + print "No Synonyms here" + + + #print "made it to the end of simplifyProcessedPscore" + # Should do this bridgy thing here + #if mapping['bridgeEvidence']==None + # + # + # + # + return scoreMatrix + +#Simple Score mechanism for all subscores, returns a sorted list. Is called after simplifyProcessedPscore +def scoreSimple(scoreMatrix, params): + threshold=params['threshold'] + + oxoDistanceOne=params['oxoDistanceOne'] + oxoDistanceTwo=params['oxoDistanceTwo'] + oxoDistanceThree=params['oxoDistanceThree'] + + fuzzyUpperLimit=params['fuzzyUpperLimit'] + fuzzyLowerLimit=params['fuzzyLowerLimit'] + fuzzyUpperFactor=params['fuzzyUpperFactor'] + fuzzyLowerFactor=params['fuzzyLowerFactor'] + + synFuzzyFactor=params['synFuzzyFactor'] + synOxoFactor=params['synOxoFactor'] + + resultMatrix=[] + for i,score in enumerate(scoreMatrix): + fFactor=0 + if score['fuzzyScore']>=fuzzyUpperLimit: + fFactor=fuzzyUpperFactor + elif score['fuzzyScore']<fuzzyUpperLimit and score['fuzzyScore']>=fuzzyLowerLimit: + fFactor=fuzzyLowerFactor + elif score['fuzzyScore']<fuzzyLowerLimit: + fFactor=0 + + if score['oxoScore']==1: + score['oxoScore']=oxoDistanceOne + if score['oxoScore']==2: + score['oxoScore']=oxoDistanceTwo + if score['oxoScore']==3: + score['oxoScore']=oxoDistanceThree + + score['finaleScore']=score['fuzzyScore']*fFactor+score['oxoScore']+score['synFuzzy']*synFuzzyFactor+score['synOxo']*synOxoFactor + + if score['finaleScore']>threshold: #This removes "unknown" from the results and weak results + resultMatrix.append(scoreMatrix[i]) +# else: + #print "Failed to pass the threshold unfortunately!" + #print score['finaleScore'] + + #Sort the thing so the best score is top + resultMatrix=sorted(resultMatrix, key=lambda resultMatrix:resultMatrix['finaleScore'], reverse=True) + return resultMatrix +
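+
+# Worked example (hypothetical numbers): with params={"fuzzyUpperLimit": 0.8,
+# "fuzzyLowerLimit": 0.6, "fuzzyUpperFactor": 1, "fuzzyLowerFactor": 0.6,
+# "oxoDistanceOne": 1, "oxoDistanceTwo": 0.3, "oxoDistanceThree": 0.1,
+# "synFuzzyFactor": 1, "synOxoFactor": 0.4, "threshold": 0.6}
+# a row {"fuzzyScore": 0.9, "oxoScore": 1, "synFuzzy": 0.9, "synOxo": 0} is scored as
+# fFactor=1 (since 0.9>=0.8) and oxoScore -> oxoDistanceOne=1, so
+# finaleScore = 0.9*1 + 1 + 0.9*1 + 0*0.4 = 2.8, which passes the threshold of 0.6.
+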
+ +#def scoreComplex(scoreMatrix): + +#Calls all necessary steps to get a result for a termLabel +def scoreTermLabel(termLabel, targetOntology, params): + pscore=primaryScoreTerm('', termLabel, targetOntology) #Executes the basic calls to OLS and OXO, delivers primary score + pscore['sourceIRI']="UNKNOWN" + calculatedMappings=processPScore(pscore) #Process the primaryScore, weighting the primary results + calculatedMappings['sourceIRI']="UNKNOWN" + simplerMatrix=simplifyProcessedPscore(calculatedMappings) #Takes the processed input and combines the results line by line + singleLineResult=scoreSimple(simplerMatrix, params) #Takes simplified input and actually calculates the final score + return singleLineResult + + +# Synonym search for comparing ontologies in OLS, should be called instead of scoreSimple for these cases +def scoreTermOLS(termIRI, termLabel, targetOntology, params): + pscore=primaryScoreTerm(termIRI, termLabel, targetOntology) + pscore['sourceIri']=termIRI + return pscore diff --git a/paxo/readme.txt b/paxo/readme.txt new file mode 100644 index 0000000..bed9168 --- /dev/null +++ b/paxo/readme.txt @@ -0,0 +1,27 @@ +Installation: + +Suggested in a virtualenv as always: +pip install -r requirements.txt + + + +Usage: + +Edit and run clientOperations.py: + +First create a raw score with + scoreOntologies(sourceOntology, targetOntology) + +Calculate a calculatedScore with: + calculatePrimaryScore(combinedOntologyName, params, writeToDisc) + +or calculate and validate a primary score with: + calculateAndValidateOntologyPrimaryScore(combinedOntologyName, stdName, stdFile, params, writeToDisc, parseParms)
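+
+For example (hypothetical ontology names, and the parameter values below are only an example):
+
+ params={"fuzzyUpperLimit": 0.8, "fuzzyLowerLimit": 0.6, "fuzzyUpperFactor": 1, "fuzzyLowerFactor": 0.6, "oxoDistanceOne": 1, "oxoDistanceTwo": 0.3, "oxoDistanceThree": 0.1, "synFuzzyFactor": 1, "synOxoFactor": 0.4, "threshold": 0.6}
+ scoreOntologies("ordo", "doid")
+ calculatePrimaryScore("ordo_doid", params, True)
+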
+ + +Prerequisite: + +1 A Folder “pipeline_output” has to be present +2 Within this folder there needs to be a folder to put the calculated primary score (e.g. the folder “ordo_doid” containing the file “scoring_output_ordo_doid.csv”) +3 A path to a validation file if you want to validate against a std file \ No newline at end of file diff --git a/paxo/requirements.txt b/paxo/requirements.txt new file mode 100644 index 0000000..47c1216 --- /dev/null +++ b/paxo/requirements.txt @@ -0,0 +1,3 @@ +requests +python-levenshtein +flask diff --git a/paxo/validation.py b/paxo/validation.py new file mode 100644 index 0000000..e09f2f2 --- /dev/null +++ b/paxo/validation.py @@ -0,0 +1,165 @@ +import csv +import logging +import requests +import time + +url="https://www.ebi.ac.uk/ols/api/search" + +def validateFinaleScore(combinedOntologyName, stdNamed, inputFile, TargetFile, writeToDisc, params, parseParms): + uri1Position=parseParms['uri1'] + uri2Position=parseParms['uri2'] + counterPosition=parseParms['scorePosition'] + delimiterChar=parseParms['delimiter'] + + logging.basicConfig(filename="flask.log", level=logging.INFO, format='%(asctime)s - %(message)s') + + inputList=[] + inputLongList=[] + for row in inputFile: + inputList.append([row[0], row[1]]) + inputLongList.append(row) + + targetList=[] + targetLongList=[] + with open(TargetFile) as csvfile: + readCSV = csv.reader(csvfile, delimiter=str(delimiterChar)) + next(readCSV) + for row in readCSV: + targetList.append([row[uri1Position], row[uri2Position]]) + targetLongList.append(row) + + + missing=[] + matches=[] + + #Now validate the computed mappings against the standard + for counter, line in enumerate(targetList): + #NoMatch from std to the created mapping file, so this goes to the missing List + if line not in inputList: + missing.append([line[0], line[1], "NoScore", targetLongList[counter][counterPosition]]) + + #Exact same Result for both, so this is a match. Is added to the matches List + else: + for c in inputLongList: + if c[0]==line[0] and c[1]==line[1] or c[1]==line[0] and c[0]==line[1]: + score=c[2] + + matches.append([line[0], line[1], score, targetLongList[counter][counterPosition]]) + #Add those mappings that were not in the standard but were calculated to the alternatives List + + alternatives=[] + for counter, line in enumerate(inputList): + if line not in targetList and line[1]!="UNKNOWN": + alternatives.append([line[0], line[1], inputLongList[counter][2], "noScore"]) + + + #Alternative Counter + alternativeCounter=0 + unrealMiss=[] + realMiss=[] + for sug in alternatives: + for miss in missing: + if sug[0]==miss[0] and sug[1]!=miss[1]: + alternativeCounter=alternativeCounter+1 + #print sug[0] +" mapped to "+sug[1]+" and "+miss[1] + unrealMiss.append(sug) + unrealMiss.append(miss) + #Real miss + else: + realMiss.append(miss) + + + result=matches+missing+alternatives#+discarded - we can also show the discarded terms or put them in a separate file + + + #If we write to disc, I get the labels of the parts that are NOT mapped to the standard + if writeToDisc is True: + print "Try to save the result" + obsoleteScore=0 + for row in result: + #if row[2]=='NoScore' or row[3]=='noScore': + print "Need to annotate "+row[0]+" and "+row[1] + + data={'q':row[0],'queryFields':'iri', 'fieldList': 'label'} + + try: + r = requests.get(url, data) + except: + time.sleep(60) + logging.info("API exception, try again after 60 second delay") + print "API exception, try again after 60 second delay" + try: + r = requests.get(url, data) + logging.info("Success") + print "Success!"
+ except: + logging.info("Error with second try") + logging.info(r.status_code) + logging.info(r.request.url) + #raise + + jsonReply=r.json() + try: + row.append(jsonReply['response']['docs'][0]['label'].encode(encoding='UTF-8')) + except: + row.append('NoLabel Found') + obsoleteScore=obsoleteScore+1 + print "No Label found in the first row" + + data={'q':row[1],'queryFields':'iri', 'fieldList': 'label'} + try: + r = requests.get(url, data) + except: + time.sleep(60) + logging.info("API exception, try again after 60 second delay") + print "API exception, try again after 60 second delay" + try: + r = requests.get(url, data) + logging.info("Success") + print "Success" + except: + logging.info("Error with second try") + logging.info(r.status_code) + logging.info(r.request.url) + + jsonReply=r.json() + try: + row.append(jsonReply['response']['docs'][0]['label'].encode(encoding='UTF-8')) + except: + row.append('NoLabel Found') + obsoleteScore=obsoleteScore+1 + print "No Label found in the second row" + + + with open('pipeline_output/'+combinedOntologyName+'_'+stdNamed+'_validate.csv', 'wb') as f: + writer = csv.writer(f) + writer.writerows(result) + f.close() + + #Logging the stats of this validation + logging.info("ParameterSet for this validation run: ") + logging.info("threshold: "+str(params["threshold"])) + logging.info("fuzzyUpperLimit: "+str(params["fuzzyUpperLimit"])) + logging.info("fuzzyLowerLimit: "+str(params["fuzzyLowerLimit"])) + logging.info("fuzzyUpperFactor: "+str(params["fuzzyUpperFactor"])) + logging.info("fuzzyLowerFactor: "+str(params["fuzzyLowerFactor"])) + logging.info("oxoDistanceOne: "+str(params["oxoDistanceOne"])) + logging.info("oxoDistanceTwo: "+str(params["oxoDistanceTwo"])) + logging.info("oxoDistanceThree: "+str(params["oxoDistanceThree"])) + logging.info("synFuzzyFactor: "+str(params["synFuzzyFactor"])) + logging.info("synOxoFactor: "+str(params["synOxoFactor"])) + + logging.info("Stats for "+combinedOntologyName+" validation "+stdNamed) + logging.info("Number of std mappings:"+str(len(targetList))) + logging.info("Total Matches: "+str(len(matches))) + logging.info("Algorithm missed compared to std: "+str(len(missing))) + logging.info("Suspected Obsoleted Terms: "+str(obsoleteScore)) + logging.info("Total alternative terms suggested: "+str(len(alternatives))) + logging.info("AlternativeOverlappingWithMisses:"+str(alternativeCounter)+"\n") + #logging.info("NotMapped: "+str(len(discarded))+"\n") + + + #return result + return {"misses": len(missing), "alternatives": len(alternatives)} + #Return the parameters and the result of the validation. This shall be useful in the future + #return {"misses": len(missing), "alternativeCounter": alternativeCounter ,"params":runParams}
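+
+# Usage sketch (calculatedRows is a hypothetical variable name): inputFile is an
+# in-memory list of [sourceIRI, targetIRI, score, ...] rows, e.g. what writeOutPutScore
+# returns, and TargetFile is the path of the standard to validate against:
+# validateFinaleScore('ordo_doid', 'loom', calculatedRows, 'Loom/DOID_ORDO_loom.csv',
+#                     True, params, {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','})
+# The call returns the counts {"misses": ..., "alternatives": ...} and logs the run
+# stats to flask.log.
+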
From 4bec60a56f2c643abe0df1c310baf4df7167ba2a Mon Sep 17 00:00:00 2001 From: LLTommy Date: Tue, 21 Nov 2017 15:40:02 +0000 Subject: [PATCH 03/66] Little update of the readme --- paxo/readme.txt | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/paxo/readme.txt b/paxo/readme.txt index bed9168..bfbe984 100644 --- a/paxo/readme.txt +++ b/paxo/readme.txt @@ -1,27 +1,27 @@ -Installation: +### Installation Suggested in a virtualenv as always: -pip install -r requirements.txt +> pip install -r requirements.txt -Usage: +### Usage Edit and run clientOperations.py: First create a raw score with - scoreOntologies(sourceOntology, targetOntology) +> scoreOntologies(sourceOntology, targetOntology) Calculate a calculatedScore with: - calculatePrimaryScore(combinedOntologyName, params, writeToDisc) +> calculatePrimaryScore(combinedOntologyName, params, writeToDisc) or calculate and validate a primary score with: - calculateAndValidateOntologyPrimaryScore(combinedOntologyName, stdName, stdFile, params, writeToDisc, parseParms) +> calculateAndValidateOntologyPrimaryScore(combinedOntologyName, stdName, stdFile, params, writeToDisc, parseParms) -Prerequisite: +### Prerequisite: -1 A Folder “pipeline_output” has to be present -2 Within this folder there needs to be a folder to put the calculated primary score (e.g. the folder “ordo_doid” containing the file “scoring_output_ordo_doid.csv”) -3 A path to a validation file if you want to validate against a std file \ No newline at end of file +* A Folder “pipeline_output” has to be present +* Within this folder there needs to be a folder to put the calculated primary score (e.g. the folder “ordo_doid” containing the file “scoring_output_ordo_doid.csv”) +* A path to a validation file if you want to validate against a std file \ No newline at end of file From 431f1ff68a63374b986f8c328e9f02b98750bf40 Mon Sep 17 00:00:00 2001 From: LLTommy Date: Tue, 21 Nov 2017 15:44:41 +0000 Subject: [PATCH 04/66] Changing txt to md which is kind of important --- paxo/{readme.txt => readme.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename paxo/{readme.txt => readme.md} (100%) diff --git a/paxo/readme.txt b/paxo/readme.md similarity index 100% rename from paxo/readme.txt rename to paxo/readme.md From 19db7c20d2cdaaaf360813e25334cb28325917ba Mon Sep 17 00:00:00 2001 From: LLTommy Date: Mon, 11 Dec 2017 13:37:04 +0000 Subject: [PATCH 05/66] Mappings including input synonym score and exact labelboost --- paxo/clientOperations.py | 39 +++++++++++++++++++++++---------------- paxo/flaskMapping.py | 20 +++++++++++++++++--- paxo/requirements.txt | 3 +++ paxo/validation.py | 16 ++++++++++++++++ 4 files changed, 59 insertions(+), 19 deletions(-) diff --git a/paxo/clientOperations.py b/paxo/clientOperations.py index 10a1674..d92b81b 100644 --- a/paxo/clientOperations.py +++ b/paxo/clientOperations.py @@ -198,15 +198,17 @@ def curationOntologyFinalScore(scoredMatrix): #print scoredMatrix for counter, line in enumerate(scoredMatrix): print line - if line[1] not in endmap: endmap.append(line[1]) unified.append(line) else: - print "Double entry Found!!! Will replace now! " + #print "Double entry Found!!! Will replace now! " index=endmap.index(line[1]) if unified[index][2] boost synonym label hits + if "synonym" in reply.keys(): + for synonym in reply["synonym"]: + answerTerm=stringProcess(synonym.encode(encoding='UTF-8')) + tmpLev=round(Levenshtein.ratio(termLabel, answerTerm), 5) + if tmpLev>lev: + lev=tmpLev + levList.append({"SourceLabel": termLabel, "SourceIRI": termLabel , "TargetIRI": reply['iri'], "TargetLabel": reply['label'], "lev":lev}) - except: + + except Exception as e: + print e print "ERROR WITH LEV Distance, score 0 for now for these two" print reply print termLabel levList.append({"SourceLabel": termLabel, "SourceIRI": termLabel , "TargetIRI": reply['iri'], "TargetLabel": reply['label']+"_ERROR", "lev":0}) resultMatrix=[] for i,score in enumerate(scoreMatrix): fFactor=0 - if score['fuzzyScore']>=fuzzyUpperLimit: + if score['fuzzyScore']==1: #Exact match, we shall boost this by all means, so we take UpperFactor*2 for now + fFactor=fuzzyUpperFactor*2 + elif score['fuzzyScore']>=fuzzyUpperLimit: fFactor=fuzzyUpperFactor elif score['fuzzyScore']<fuzzyUpperLimit and score['fuzzyScore']>=fuzzyLowerLimit: fFactor=fuzzyLowerFactor diff --git a/paxo/requirements.txt b/paxo/requirements.txt index 47c1216..4b36ac6 100644 --- a/paxo/requirements.txt +++ b/paxo/requirements.txt @@ -1,3 +1,6 @@ requests python-levenshtein flask +spotpy +numpy +matplotlib \ No newline at end of file diff --git a/paxo/validation.py b/paxo/validation.py index e09f2f2..90f5761 100644 --- a/paxo/validation.py +++ b/paxo/validation.py @@ -18,6 +18,9 @@ def validateFinaleScore(combinedOntologyName, stdNamed, inputFile, TargetFile, w for row in inputFile: inputList.append([row[0], row[1]]) inputLongList.append(row) + #if row[2]=='': + # print "Oh No, we found an empty value!" + targetList=[] targetLongList=[] @@ -32,10 +35,16 @@ def validateFinaleScore(combinedOntologyName, stdNamed, inputFile, TargetFile, w missing=[] matches=[] + #print inputList + #Now validate the computed mappings against the standard for counter, line in enumerate(targetList): #NoMatch from std to the created mapping file, so this goes to the missing List if line not in inputList: + #if line[0]=="http://purl.obolibrary.org/obo/HP_0009059" or line[0]=="http://purl.obolibrary.org/obo/HP_0025247": + #print line + #print targetLongList[counter] + missing.append([line[0], line[1], "NoScore", targetLongList[counter][counterPosition]]) #Exact same Result for both, so this is a match.
Is added to the matches List @@ -72,6 +81,13 @@ def validateFinaleScore(combinedOntologyName, stdNamed, inputFile, TargetFile, w result=matches+missing+alternatives#+discarded - we can also show the discarded terms or put them in a separate file + + #print "unrealMiss" + #print unrealMiss + #print "Real Miss" + #print realMiss + #print "Result" + #print result + #If we write to disc, I get the labels of the parts that are NOT mapped to the standard if writeToDisc is True: print "Try to save the result" From 9b231c83fd2aad393ab8be40509c20e2b8fc4e4d Mon Sep 17 00:00:00 2001 From: LLTommy Date: Mon, 18 Dec 2017 14:20:53 +0100 Subject: [PATCH 06/66] Includes the new string matcher in the algorithm --- paxo/clientOperations.py | 166 +++++++++++++++++---------- paxo/flaskMapping.py | 206 ++++++++++++++++++++++++++++--------- paxo/requirements.txt | 3 +- paxo/validation.py | 73 +++++++------- 4 files changed, 308 insertions(+), 140 deletions(-) diff --git a/paxo/clientOperations.py b/paxo/clientOperations.py index d92b81b..2d818eb 100644 --- a/paxo/clientOperations.py +++ b/paxo/clientOperations.py @@ -9,9 +9,10 @@ import ast url="https://www.ebi.ac.uk/ols/api/" +#url="http://snarf.ebi.ac.uk:8980/ols-beta/api" #Compares two ontologies from the OLS. This process can take a while and produces a csv with primary results -def scoreOntologies(sourceOntology, targetOntology): +def scoreOntologies(sourceOntology, targetOntology, scoreParams): logging.info("Start scoring "+sourceOntology+" and "+targetOntology) #Check for the smaller ontology r = requests.get(url+"ontologies/"+sourceOntology) @@ -21,7 +26,7 @@ def scoreOntologies(sourceOntology, targetOntology): sourceOntology=targetOntology targetOntology=tmpOntology - termsUrl=url+"ontologies/"+sourceOntology+"/terms?size=500&fieldList=iri,label,synonym" + termsUrl=url+"ontologies/"+sourceOntology+"/terms?size=100&fieldList=iri,label,synonym" results=[] @@ -47,8 +48,9 @@ def scoreOntologies(sourceOntology, targetOntology): #Check if the term is actually defined in that ontology if term['is_defining_ontology'] is True: + pscore=flaskMapping.scoreTermOLS(term["iri"], originalLabel, targetOntology, scoreParams) try: - pscore=flaskMapping.scoreTermOLS(term["iri"], originalLabel, targetOntology, {}) + calculatedMappings=flaskMapping.processPScore(pscore) except Exception as e: print "Exception in primary Scoring" @@ -65,7 +67,7 @@ def scoreOntologies(sourceOntology, targetOntology): if synonyms!=None: for synonym in synonyms: try: - synPscore=flaskMapping.primaryScoreTerm('', synonym, targetOntology) + synPscore=flaskMapping.primaryScoreTerm('', synonym, targetOntology, scoreParams) synCalculatedMappings=flaskMapping.processPScore(synPscore) #Process the primaryScore for synonyms synCalculatedMappings['sourceIRI']=term["iri"] @@ -92,7 +94,7 @@ def scoreOntologies(sourceOntology, targetOntology): if counter%2==0: print "Processed "+str(counter)+" pages" logging.info("Processed "+str(counter)+" pages") - #break # Not necessary if we want to parse whole ontology, just activate this for testing + break # Not necessary if we want to parse whole ontology, just activate this for testing except: @@ -121,21 +123,27 @@ def scoreOntologyPrimaryScore(name): tmp=row[4] synFuzzy=ast.literal_eval(tmp) tmp=row[5] synOxo=ast.literal_eval(tmp) + tmp=row[6] + bridgeEvidence=ast.literal_eval(tmp)
for i in fuzzy: - obj={"sourceTerm":originalLabel, "sourceIRI":orginaliri ,"iri": i['fuzzyIri'], "olsFuzzyScore": fuzzy, "oxoScore": oxo, "synFuzzy": synFuzzy, "synOxo":synOxo} + obj={"sourceTerm":originalLabel, "sourceIRI":orginaliri ,"iri": i['fuzzyIri'], "olsFuzzyScore": fuzzy, "oxoScore": oxo, "synFuzzy": synFuzzy, "synOxo":synOxo, "bridgeEvidence":bridgeEvidence} scoreMatrix.append(obj) for i in oxo: - obj={"sourceTerm":originalLabel, "sourceIRI":orginaliri, "iri": i['oxoCurie'], "olsFuzzyScore": fuzzy, "oxoScore": oxo, "synFuzzy": synFuzzy, "synOxo":synOxo} + obj={"sourceTerm":originalLabel, "sourceIRI":orginaliri, "iri": i['oxoCurie'], "olsFuzzyScore": fuzzy, "oxoScore": oxo, "synFuzzy": synFuzzy, "synOxo":synOxo, "bridgeEvidence":bridgeEvidence} scoreMatrix.append(obj) for i in synFuzzy: - obj={"sourceTerm":originalLabel, "sourceIRI":orginaliri, "iri": i['fuzzyIri'], "olsFuzzyScore": fuzzy, "oxoScore": oxo, "synFuzzy": synFuzzy, "synOxo":synOxo} + obj={"sourceTerm":originalLabel, "sourceIRI":orginaliri, "iri": i['fuzzyIri'], "olsFuzzyScore": fuzzy, "oxoScore": oxo, "synFuzzy": synFuzzy, "synOxo":synOxo, "bridgeEvidence":bridgeEvidence} scoreMatrix.append(obj) for i in synOxo: - obj={"sourceTerm":originalLabel, "sourceIRI":orginaliri, "iri": i['oxoCurie'], "olsFuzzyScore": fuzzy, "oxoScore": oxo, "synFuzzy": synFuzzy, "synOxo":synOxo} + obj={"sourceTerm":originalLabel, "sourceIRI":orginaliri, "iri": i['oxoCurie'], "olsFuzzyScore": fuzzy, "oxoScore": oxo, "synFuzzy": synFuzzy, "synOxo":synOxo, "bridgeEvidence":bridgeEvidence} + scoreMatrix.append(obj) + + for i in bridgeEvidence: + obj={"sourceTerm":originalLabel, "sourceIRI":orginaliri, "iri": i['oxoCurie'], "olsFuzzyScore": fuzzy, "oxoScore": oxo, "synFuzzy": synFuzzy, "synOxo":synOxo, "bridgeEvidence":bridgeEvidence} scoreMatrix.append(obj) @@ -196,39 +204,54 @@ def curationOntologyFinalScore(scoredMatrix): endmap=[] unified=[] #print scoredMatrix + doubleEntryCounter=0 + replacedList=[] for counter, line in enumerate(scoredMatrix): - print line + #print line if line[1] not in endmap: endmap.append(line[1]) unified.append(line) else: - #print "Double entry Found!!! Will replace now! " + #print "Double entry Found!!!" + doubleEntryCounter=doubleEntryCounter+1 index=endmap.index(line[1]) + #print unified[index] + #print scoredMatrix[counter] if unified[index][2] -175? 
+#Best run at 39 of 50 (best like=-175) with parameter set: +#[ 5.17002538e-05 9.97186358e-01 5.14350766e-03 5.08109418e-01 +# 9.98062574e-01 9.11663359e-02 4.59066160e-01 6.72066822e-01 +# 4.07275455e-01] + +#params={"fuzzyUpperLimit": 0.00005, "fuzzyLowerLimit": 0.99,"fuzzyUpperFactor": 0.5,"fuzzyLowerFactor":0.99, "oxoDistanceOne":0.09, "oxoDistanceTwo":0.459, "oxoDistanceThree":0.67, "synFuzzyFactor":0.41, "synOxoFactor": 0.38, "threshold":0.6} + + +#params={"fuzzyUpperLimit": 0.8, "fuzzyLowerLimit": 0.6,"fuzzyUpperFactor": 1,"fuzzyLowerFactor":0.6, "oxoDistanceOne":1, "oxoDistanceTwo":0.3, "oxoDistanceThree":0.1, "synFuzzyFactor":1, "synOxoFactor": 0.4, "threshold":0.6} + ###Score all Ontologies in the config file #scoreListOntologies(sections) +#config.get() +#oxoURL=config.get("replacements","oxoURL") +#print list(config.items('replacements')) + +removeStopwordsList=['of', 'the'] +replaceTermList=[('cancer', 'carcinom'), ('cancer', 'neoplasm'), ('cancer','carcinoma'),('abnormality','disease')] +scoreParams={"removeStopwordsList": removeStopwordsList, "replaceTermList" :replaceTermList} + +hp_doid_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : [('cancer', 'carcinom'), ('cancer', 'neoplasm'), ('cancer','carcinoma'),('abnormality','disease')]} + ### Primary score ontologies +ordo_hp_scoreParams={"removeStopwordsList": ['of', 'the', 'Rare'], "replaceTermList" : [('cancer', 'carcinom'), ('cancer', 'neoplasm'), ('cancer','carcinoma'),('tumor', 'neoplasm'), ('tumor','cancer'), ('abnormality', 'disease'), ('decreased', 'reduced'), ('morphology', '')]} +scoreOntologies("ordo","hp", ordo_hp_scoreParams) + +doid_ordo_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : []} +scoreOntologies("doid","mp", doid_ordo_scoreParams) +# +doid_ordo_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : []} +scoreOntologies("doid","ordo", doid_ordo_scoreParams) +# +hp_doid_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : [('cancer', 'neoplasm'), ('cancer','carcinoma'), ('abnormality','disease')]} +scoreOntologies("hp","doid",hp_doid_scoreParams) +# +hp_mp_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : [('cancer', 'carcinom'), ('cancer', 'neoplasm'), ('cancer','carcinoma'),('abnormality','disease'), ('abnormal','Abnormality')]} +scoreOntologies("hp","mp", hp_mp_scoreParams) +# +ordo_mp_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : []} +scoreOntologies("ordo","mp", ordo_mp_scoreParams) + + #scoreOntologies("mesh","hp") #scoreOntologies("mesh","doid") #scoreOntologies("mesh","mp") writeToDisc=False uniqueMaps=False ### Execute Calculate and validate for a certain file -#calculateAndValidateOntologyPrimaryScore('hp_doid', 'loom', 'Loom/DOID_HP_loom.csv', params, True, {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}) -#calculateAndValidateOntologyPrimaryScore('hp_doid', 'silver','silver_nov/Consensus-3-hp-doid.tsv', params, True, {'uri1':0, 'uri2':2, 'scorePosition':4 , 'delimiter':'\t'}) -###calculateAndValidateOntologyPrimaryScore('ordo_hp', 'loom', 'Loom/ordo_hp_loom.csv', params, True, {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}) -#print
calculateAndValidateOntologyPrimaryScore('ordo_hp', 'silver','silver_nov/Consensus-3-hp-ordo.tsv', params, False, {'uri1':2, 'uri2':0, 'scorePosition':4 , 'delimiter':'\t'}) - -#calculateAndValidateOntologyPrimaryScore('mp_hp', 'loom','Loom/MP_HP_loom.csv', params, True, {'uri1':0, 'uri2':1, 'scorePosition':2 , 'delimiter':','}) -#calculateAndValidateOntologyPrimaryScore('mp_hp', 'silver','silver_nov/Consensus-3-hp-mp.tsv', params, True, {'uri1':2, 'uri2':0, 'scorePosition':4 , 'delimiter':'\t'}) -#calculateAndValidateOntologyPrimaryScore('ordo_doid', 'loom' ,'Loom/DOID_ORDO_loom.csv', params, True, {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}) -#calculateAndValidateOntologyPrimaryScore('ordo_doid', 'silver','silver_nov/Consensus-3-doid-ordo.tsv', params, True, {'uri1':2, 'uri2':0, 'scorePosition':4 , 'delimiter':'\t'}) -#calculateAndValidateOntologyPrimaryScore('ordo_mp', 'loom', 'Loom/mp_ordo_loom.csv', params, True, {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}) -#calculateAndValidateOntologyPrimaryScore('ordo_mp', 'silver','silver_nov/Consensus-3-mp-ordo.tsv', params, True, {'uri1':2, 'uri2':0, 'scorePosition':4 , 'delimiter':'\t'}) -###calculateAndValidateOntologyPrimaryScore('mp_doid', 'loom', 'Loom/DOID_MP_loom.csv', params, True, {'uri1':1, 'uri2':0, 'scorePosition':2, 'delimiter':','}) -###calculateAndValidateOntologyPrimaryScore('mp_doid', 'silver','silver_nov/Consensus-3-mp-doid.tsv', params, True, {'uri1':0, 'uri2':2, 'scorePosition':4 , 'delimiter':'\t'}) - - -#calculateAndValidateOntologyPrimaryScore('mesh_doid', 'loom', 'Loom/DOID_MESH_loom_new.csv', params, True, {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}) -#calculateAndValidateOntologyPrimaryScore('mesh_hp', 'loom', 'Loom/mesh_hp_loom_new.csv', params, True, {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}) -#calculateAndValidateOntologyPrimaryScore('mesh_mp', 'loom', 'Loom/mesh_mp_loom_new.csv', params, True, {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}) - +# print calculateAndValidateOntologyPrimaryScore('hp', 'doid', 'loom', 'Loom/DOID_HP_loom.csv', params, writeToDisc, {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps) +# print calculateAndValidateOntologyPrimaryScore('hp','doid', 'silver','silver_nov/Consensus-3-hp-doid.tsv', params, writeToDisc, {'uri1':0, 'uri2':2, 'scorePosition':4 , 'delimiter':'\t'}, uniqueMaps) +# print calculateAndValidateOntologyPrimaryScore('ordo', 'hp', 'loom', 'Loom/ordo_hp_loom.csv', params, writeToDisc, {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps) +# print calculateAndValidateOntologyPrimaryScore('ordo', 'hp', 'silver','silver_nov/Consensus-3-hp-ordo.tsv', params, writeToDisc, {'uri1':2, 'uri2':0, 'scorePosition':4 , 'delimiter':'\t'}, uniqueMaps) +# # +# print calculateAndValidateOntologyPrimaryScore('mp','hp', 'loom','Loom/MP_HP_loom.csv', params, writeToDisc, {'uri1':0, 'uri2':1, 'scorePosition':2 , 'delimiter':','}, uniqueMaps) +# print calculateAndValidateOntologyPrimaryScore('mp','hp', 'silver','silver_nov/Consensus-3-hp-mp.tsv', params, writeToDisc, {'uri1':2, 'uri2':0, 'scorePosition':4 , 'delimiter':'\t'}, uniqueMaps) +# print calculateAndValidateOntologyPrimaryScore('ordo','doid', 'loom' ,'Loom/DOID_ORDO_loom.csv', params, writeToDisc, {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps) +# print calculateAndValidateOntologyPrimaryScore('ordo','doid', 'silver','silver_nov/Consensus-3-doid-ordo.tsv', params, writeToDisc, {'uri1':2, 'uri2':0, 'scorePosition':4 , 
'delimiter':'\t'}, uniqueMaps) +# print calculateAndValidateOntologyPrimaryScore('ordo','mp', 'loom', 'Loom/mp_ordo_loom.csv', params, writeToDisc, {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps) +# print calculateAndValidateOntologyPrimaryScore('ordo','mp', 'silver','silver_nov/Consensus-3-mp-ordo.tsv', params, writeToDisc, {'uri1':2, 'uri2':0, 'scorePosition':4 , 'delimiter':'\t'}, uniqueMaps) +# print calculateAndValidateOntologyPrimaryScore('mp','doid', 'loom', 'Loom/DOID_MP_loom.csv', params, writeToDisc, {'uri1':1, 'uri2':0, 'scorePosition':2, 'delimiter':','}, uniqueMaps) +# print calculateAndValidateOntologyPrimaryScore('mp','doid', 'silver','silver_nov/Consensus-3-mp-doid.tsv', params, writeToDisc, {'uri1':0, 'uri2':2, 'scorePosition':4 , 'delimiter':'\t'}, uniqueMaps) +# # +# +# print calculateAndValidateOntologyPrimaryScore('mesh','doid', 'loom', 'Loom/DOID_MESH_loom_new.csv', params, writeToDisc, {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps) +# print calculateAndValidateOntologyPrimaryScore('mesh','doid', 'silver', 'silver_nov/Consensus-3-doid-mesh3.tsv', params, writeToDisc, {'uri1':2, 'uri2':0, 'scorePosition':2, 'delimiter':'\t'}, uniqueMaps) +# print calculateAndValidateOntologyPrimaryScore('mesh','hp', 'loom', 'Loom/mesh_hp_loom_new.csv', params, writeToDisc, {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps) +# print calculateAndValidateOntologyPrimaryScore('mesh','doid', 'silver', 'silver_nov/Consensus-3-doid-mesh3.tsv', params, writeToDisc, {'uri1':1, 'uri2':0, 'scorePosition':2, 'delimiter':'\t'}, uniqueMaps) +# print calculateAndValidateOntologyPrimaryScore('mesh','mp', 'loom', 'Loom/mesh_mp_loom_new.csv', params, writeToDisc, {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps) +# print calculateAndValidateOntologyPrimaryScore('mesh','doid', 'silver', 'silver_nov/Consensus-3-doid-mesh3.tsv', params, writeToDisc, {'uri1':1, 'uri2':0, 'scorePosition':2, 'delimiter':'\t'}, uniqueMaps) ###Execute functions for terms (#Broken since last change?) # --> needs to be investigated! #print flaskMapping.scoreTermLabel("Nuclear cataract", "doid", params) ###Calculate the score from the primary file and run it against the validation files.
For all Ontologies in the config file #calculateAndValidateListOntologies(sections) diff --git a/paxo/flaskMapping.py b/paxo/flaskMapping.py index c165774..8d8f2e1 100644 --- a/paxo/flaskMapping.py +++ b/paxo/flaskMapping.py @@ -19,9 +19,6 @@ logging.basicConfig(filename="flask.log", level=logging.INFO, format='%(asctime)s - %(message)s') -#List of terms that should be cut out of label before fuzzy match # Shall come from config file later -cutList=["abnormalityof", "syndrome", "disease", "cancer", "tumor"] @@ -91,48 +88,138 @@ def oxoMatch(termLabel, targetOntology): return [{"curie":"UNKNOWN", "distance": 0}] + +############################################################ +##List of terms that should be cut out of label before fuzzy match # Shall come from config file later +#cutList=["abnormalityof", "syndrome", "disease", "cancer", "tumor", "abnormal"] + +# def stringProcess(term): +# processedTerm=term.lower() #Make string lower case +# processedTerm=processedTerm.replace(" ", "") #Remove all spaces +# +# #Simply cut some things from the label before calculating the Levenshtein distance +# for cut in cutList: +# tmpArray=term.split(cut) #Remove problematic terms +# if len(tmpArray[0])!=0: +# processedTerm=tmpArray[0] +# break +# elif len(tmpArray[1])!=0: +# processedTerm=tmpArray[1] +# break +# else: +# print "Something is wrong" +# break +# +# return processedTerm + +############################################################ +#removeStopwordsList=['of', 'the'] +#replaceTermList=[('cancer', 'carcinom'), ('cancer', 'neoplasm'), ('cancer','carcinoma'),('abnormality','disease')] + +def sortWords(term): + term=term.lower() + StringList=term.split(' ') + StringList.sort() + term=' '.join(StringList) + return term + +def stringMatcher(sourceTerm, targetTerm, replaceTermList, removeStopwordsList): + #First calculate Lev without changes + lev=round(Levenshtein.ratio(sourceTerm, targetTerm), 5) + #print "Straight Lev: "+sourceTerm+" - "+targetTerm+" --> "+str(lev) + + sourceTerm=sortWords(sourceTerm) + targetTerm=sortWords(targetTerm) + + + replacementLev=round(Levenshtein.ratio(sourceTerm, targetTerm), 5) + #print "Sorted Lev: "+sourceTerm+" - "+targetTerm+" --> "+str(replacementLev) + if replacementLev>lev: + lev=replacementLev + #print " Score Higher, so replaced" + + #Remove stop words + for stop in removeStopwordsList: + sourceTerm=sourceTerm.replace(stop,'').strip().replace(' ', ' ') + targetTerm=targetTerm.replace(stop, '').strip().replace(' ', ' ') + + #print "Removed Stopwords Lev: "+sourceTerm+" - "+targetTerm+" --> "+str(round(Levenshtein.ratio(sourceTerm, targetTerm), 5)) + + #print "SourceReplacements:" + #Replace terms in source to try to find a higher score + for replacement in replaceTermList: + tmpSource=sourceTerm.replace(replacement[0], replacement[1]) + tmpSource=sortWords(tmpSource) + replacementLev=round(Levenshtein.ratio(tmpSource, targetTerm), 5) + #print " Replacements: "+tmpSource+" - "+targetTerm+" --> "+str(replacementLev) + if replacementLev>lev: + lev=replacementLev + #print " Score Higher, so replaced" + +# tmpSource=tmpSource.split(' ') +# tmpSource.sort() +# tmpSource=' '.join(tmpSource) +# replacementLev=round(Levenshtein.ratio(tmpSource, targetTerm), 5) +# print " Replacements Source: "+tmpSource+" - "+targetTerm+" --> "+str(replacementLev) +# if replacementLev>lev: +# lev=replacementLev + + #print "TargetReplacements:" + #Replace terms in target to try to find a higher score + for replacement in replaceTermList: + tmpTarget=targetTerm.replace(replacement[0], replacement[1]) + tmpTarget=sortWords(tmpTarget) + replacementLev=round(Levenshtein.ratio(sourceTerm, tmpTarget), 5) + #print " Replacements: "+sourceTerm+" - "+tmpTarget+" --> "+str(replacementLev) + if replacementLev>lev: + lev=replacementLev + #print " Score Higher, so replaced" + + return lev +############################################################
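+
+# Quick illustration (hypothetical strings): stringMatcher is word-order insensitive and
+# tries the configured replacements on both sides, keeping the best ratio. For instance
+# stringMatcher("neoplasm of the lung", "lung cancer", [('cancer', 'neoplasm')], ['of', 'the'])
+# sorts the words, strips 'of'/'the', rewrites 'cancer' to 'neoplasm' on the target side
+# and ends up comparing "lung neoplasm" with "lung neoplasm", so it returns 1.0.
+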
Higher, so replaced" + +# tmpSource=tmpSource.split(' ') +# tmpSource.sort() +# tmpSource=' '.join(tmpSource) +# replacementLev=round(Levenshtein.ratio(tmpSource, targetTerm), 5) +# print " Replacements Source: "+tmpSource+" - "+targetTerm+" --> "+str(replacementLev) +# if replacementLev>lev: +# lev=replacementLev + + #print "TargetReplacements:" + #Replace terms in target to trying to find higher score + for replacement in replaceTermList: + tmpTarget=targetTerm.replace(replacement[0], replacement[1]) + tmpTarget=sortWords(tmpTarget) + replacementLev=round(Levenshtein.ratio(sourceTerm, tmpTarget), 5) + #print " Replacements: "+sourceTerm+" - "+tmpTarget+" --> "+str(replacementLev) + if replacementLev>lev: + lev=replacementLev + #print " Score Higher, so replaced" + + return lev +############################################################ + + + + + + + + - return processedTerm #Takes an input label and executes the fuzzyOLS call -def olsFuzzyMatch(termLabel, targetOntology): +def olsFuzzyMatch(termLabel, targetOntology, replaceTermList, removeStopwordsList): data={"q":termLabel, "ontology":targetOntology, "type":"class", "local":True, "fieldList":"label,iri,synonym"} jsonReply=apiCall(searchURL, data) termLabel=termLabel.encode(encoding='UTF-8') - termLabel=stringProcess(termLabel) + #stringProcess(termLabel) #WE found at least 1 hit - jsonReply=jsonReply.json()['response'] + try: + jsonReply=jsonReply.json()['response'] + except: + print "Error with deoding jsonReply from OLS api call!" + print jsonReply + if jsonReply['numFound']>0: levList=[] for reply in jsonReply['docs']: try: - answerTerm=stringProcess(reply['label'].encode(encoding='UTF-8')) - lev=round(Levenshtein.ratio(termLabel, answerTerm), 5) + #answerTerm=stringProcess(reply['label'].encode(encoding='UTF-8')) + answerTerm=reply['label'].encode(encoding='UTF-8') + + #lev=round(Levenshtein.ratio(termLabel, answerTerm), 5) + lev=stringMatcher(termLabel, answerTerm, replaceTermList, removeStopwordsList) #Compare the inputLabel with all synonym Labels as well. #If lev score is higher for a synonym, replace lev score --> boost synonym label hits if "synonym" in reply.keys(): for synonym in reply["synonym"]: - answerTerm=stringProcess(synonym.encode(encoding='UTF-8')) - tmpLev=round(Levenshtein.ratio(termLabel, answerTerm), 5) + #answerTerm=stringProcess(synonym.encode(encoding='UTF-8')) + answerTerm=synonym.encode(encoding='UTF-8') + #tmpLev=round(Levenshtein.ratio(termLabel, answerTerm), 5) + tmpLev=stringMatcher(termLabel, answerTerm, replaceTermList, removeStopwordsList) if tmpLev>lev: lev=tmpLev @@ -155,7 +242,14 @@ def olsFuzzyMatch(termLabel, targetOntology): ##Now let's relax The fuzzy search and aim for other (all) ontologies data={"q":termLabel, "type":"class", "local":True, "limit":30} jsonReply=apiCall(searchURL, data) - jsonReply=jsonReply.json()['response'] + try: + jsonReply=jsonReply.json()['response'] + except: + print "Error with decoding jsonReply from RELAXED OLS api call!" 
+ print jsonReply + + + #jsonReply=jsonReply.json()['response'] oxoTargetList=[] if jsonReply['numFound']>0: for reply in jsonReply['docs']: @@ -165,8 +259,11 @@ def olsFuzzyMatch(termLabel, targetOntology): return {"fuzzyTerms": sortedLev, "bridgeTerms": oxoTargetList} #Executes the basic calls, delievers primary score (raw scoring) -def primaryScoreTerm(termIRI, termLabel, targetOntology): - olsFuzzyResult=olsFuzzyMatch(termLabel, targetOntology) +def primaryScoreTerm(termIRI, termLabel, targetOntology, scoreParams): + replaceTermList=scoreParams["replaceTermList"] + removeStopwordsList=scoreParams["removeStopwordsList"] + + olsFuzzyResult=olsFuzzyMatch(termLabel, targetOntology, replaceTermList, removeStopwordsList) if termIRI!='': oxoResults=oxoMatch(termIRI, targetOntology) @@ -176,17 +273,15 @@ def primaryScoreTerm(termIRI, termLabel, targetOntology): bridgeTerms=olsFuzzyResult['bridgeTerms'] olsFuzzyResult=olsFuzzyResult['fuzzyTerms'] - logging.info("On the search for this fk bridge Terms") - logging.info(bridgeTerms) + #if bridgeTerms!=[]: + #print "Found bridge terms, it is incredible!" + bridgeOxo=[] if len(bridgeTerms)>0: for bridgeTerm in bridgeTerms: tmp=oxoMatch(bridgeTerm['short_form'],targetOntology) - for line in tmp: if line['curie']!='UNKNOWN': - #print "FOUND a bridge OxO thing! Finally!" - #logging.info("FOUND a bridge OxO thing! Finally!") bridgeOxo.append(tmp) else: bridgeOxo=[[{"curie":"UNKNOWN", "distance": 0}]] @@ -210,7 +305,6 @@ def primaryScoreTerm(termIRI, termLabel, targetOntology): def processPScore(pScore): mapping={'sourceTerm':pScore['sourceTerm'], "olsFuzzyScore": [], "oxoScore": [], "bridgeEvidence": []} - for fuzzy in pScore['olsFuzzyScore']: mapping['olsFuzzyScore'].append({'fuzzyMapping': fuzzy['TargetLabel'], 'fuzzyIri':fuzzy['TargetIRI'],'fuzzyScore': fuzzy['lev']}) @@ -226,12 +320,13 @@ def processPScore(pScore): #Is here there right place to do it? Unsure at the moment for oxo in pScore['bridgeEvidence']: tmpCurie=oxo['curie'] - oxoScore=int(oxo['distance']) + bridgeOxoScore=int(oxo['distance']) if int(oxo['distance'])==0: tmpCurie="UNKNOWN" - mapping['bridgeEvidence'].append({'oxoCurie':tmpCurie, "distance": oxo['distance'], "oxoScore":oxoScore}) + mapping['bridgeEvidence'].append({'oxoCurie':tmpCurie, "distance": oxo['distance'], "oxoScore":bridgeOxoScore}) + return mapping @@ -241,11 +336,10 @@ def simplifyProcessedPscore(mapping): sourceIRI=mapping['sourceIRI'] flag=False - for line in mapping['olsFuzzyScore']: if line['fuzzyScore']==[]: line['fuzzyScore']=0 - obj={"sourceTerm":mapping['sourceTerm'], "sourceIRI":sourceIRI,"iri":line['fuzzyIri'], "fuzzyScore": line['fuzzyScore'], "oxoScore": 0, "synFuzzy":0, "synOxo": 0} + obj={"sourceTerm":mapping['sourceTerm'], "sourceIRI":sourceIRI,"iri":line['fuzzyIri'], "fuzzyScore": line['fuzzyScore'], "oxoScore": 0, "synFuzzy":0, "synOxo": 0, "bridgeOxoScore":0} scoreMatrix.append(obj) flag=False @@ -256,7 +350,7 @@ def simplifyProcessedPscore(mapping): flag=True if flag==False: - obj={"sourceTerm":mapping['sourceTerm'], "sourceIRI":sourceIRI, "iri":line['oxoCurie'], "fuzzyScore": 0, "oxoScore": line['oxoScore'], "synFuzzy":0, "synOxo": 0} + obj={"sourceTerm":mapping['sourceTerm'], "sourceIRI":sourceIRI, "iri":line['oxoCurie'], "fuzzyScore": 0, "oxoScore": line['oxoScore'], "synFuzzy":0, "synOxo": 0, "bridgeOxoScore":0} scoreMatrix.append(obj) # Starting here we try to take care of synonyms! 
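# Aside (illustrative sketch, not code from this patch): the new stringMatcher
# above boils down to comparing word-sorted, stop-word-stripped labels and
# keeping the best Levenshtein ratio over all configured replacement pairs.
# It assumes only the python-Levenshtein package this module already imports;
# the helper name and example inputs here are made up for illustration.
import Levenshtein

def bestRatio(sourceTerm, targetTerm, replaceTermList, removeStopwordsList):
    def norm(term):
        words = [w for w in term.lower().split() if w not in removeStopwordsList]
        return ' '.join(sorted(words))
    source, target = norm(sourceTerm), norm(targetTerm)
    best = Levenshtein.ratio(source, target)
    for old, new in replaceTermList:
        # try each substitution on either side and keep the highest score
        best = max(best,
                   Levenshtein.ratio(norm(source.replace(old, new)), target),
                   Levenshtein.ratio(source, norm(target.replace(old, new))))
    return round(best, 5)

# bestRatio("abnormality of the heart", "heart disease",
#           [("abnormality", "disease")], ["of", "the"])  # -> 1.0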
@@ -272,7 +366,7 @@ def simplifyProcessedPscore(mapping): flag=True if flag==False: - obj={"sourceTerm":mapping['sourceTerm'], "sourceIRI":sourceIRI,"iri":line['fuzzyIri'], "fuzzyScore":0, "oxoScore": 0, "synFuzzy": line['fuzzyScore'], "synOxo":0} + obj={"sourceTerm":mapping['sourceTerm'], "sourceIRI":sourceIRI,"iri":line['fuzzyIri'], "fuzzyScore":0, "oxoScore": 0, "synFuzzy": line['fuzzyScore'], "synOxo":0, "bridgeOxoScore":0} scoreMatrix.append(obj) # Oxo Synonyms Score @@ -285,13 +379,27 @@ def simplifyProcessedPscore(mapping): flag=True if flag==False: - obj={"sourceTerm":mapping['sourceTerm'], "sourceIRI":sourceIRI, "iri":line['oxoCurie'], "fuzzyScore": 0, "oxoScore": 0, "synFuzzy":0, "synOxo": line['oxoScore']} + obj={"sourceTerm":mapping['sourceTerm'], "sourceIRI":sourceIRI, "iri":line['oxoCurie'], "fuzzyScore": 0, "oxoScore": 0, "synFuzzy":0, "synOxo": line['oxoScore'], "bridgeOxoScore":0} scoreMatrix.append(obj) else: print "No Synonyms here" + + #Getting into bridge evidence + flag=False + for line in mapping['bridgeEvidence']: + for s in scoreMatrix: + if line["oxoCurie"]==s["iri"]: + s['oxoScore']=line['oxoScore'] + flag=True + + if flag==False: + obj={"sourceTerm":mapping['sourceTerm'], "sourceIRI":sourceIRI, "iri":line['bridgeOxoCurie'], "fuzzyScore": 0, "oxoScore": 0, "synFuzzy":0, "synOxo": 0, "bridgeOxoScore": line['oxoScore']} + scoreMatrix.append(obj) + + #print "made it to the end of simplifyProcessedPscore" # Should do this bridgy thing here #if mapping['bridgeEvidence']==None @@ -324,7 +432,7 @@ def scoreSimple(scoreMatrix, params): for i,score in enumerate(scoreMatrix): fFactor=0 if score['fuzzyScore']==1: #Exact match, we shall boost this by all means, so we take UpperFactor*2 for now - fFactor=fuzzyUpperFactor*2 + fFactor=2 elif score['fuzzyScore']>=fuzzyUpperLimit: fFactor=fuzzyUpperFactor elif score['fuzzyScore']=fuzzyLowerLimit: @@ -340,7 +448,13 @@ def scoreSimple(scoreMatrix, params): if score['oxoScore']==3: score['oxoScore']=oxoDistanceThree - score['finaleScore']=score['fuzzyScore']*fFactor+score['oxoScore']+score['synFuzzy']*synFuzzyFactor+score['synOxo']*synOxoFactor + + if score['bridgeOxoScore']>0: + print "FOUND an incredible bridge Term, uhauha!" 
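# note: the finaleScore computed just below is a weighted sum over the
# evidence channels: fuzzy label similarity (scaled by fFactor), OxO distance,
# the synonym variants of both, and now the bridge-term score. For example, an
# exact label match (fuzzyScore=1, so fFactor=2) plus a distance-one OxO
# mapping with the default oxoDistanceOne weight of 1 gives
# 1*2 + 1 + 0 + 0 + 0 = 3, comfortably above the usual threshold of 0.6.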
+ print scoreMatrix[i] + + + score['finaleScore']=score['fuzzyScore']*fFactor+score['oxoScore']+score['synFuzzy']*synFuzzyFactor+score['synOxo']*synOxoFactor+score['bridgeOxoScore'] ### Do we want unknown to be printed if score['finaleScore']>threshold: #This removes "unknow" from the results and weak results @@ -358,7 +472,7 @@ def scoreSimple(scoreMatrix, params): #Calls all necessary steps to get a result for a termLabel def scoreTermLabel(termLabel, targetOntology, params): - pscore=primaryScoreTerm('', termLabel, targetOntology) #Executes the basic calls to OLS and OXO, delievers primary score + pscore=primaryScoreTerm('', termLabel, targetOntology, params) #Executes the basic calls to OLS and OXO, delievers primary score pscore['sourceIRI']="UNKNOWN" calculatedMappings=processPScore(pscore) #Process the primaryScore, weighting the primary results calculatedMappings['sourceIRI']="UNKNOWN" @@ -369,6 +483,6 @@ def scoreTermLabel(termLabel, targetOntology, params): # Synonymsearch for comparing Ontologies in OLS, should be called instead score Simple for these cases def scoreTermOLS(termIRI, termLabel, targetOntology, params): - pscore=primaryScoreTerm(termIRI, termLabel, targetOntology) + pscore=primaryScoreTerm(termIRI, termLabel, targetOntology, params) pscore['sourceIri']=termIRI return pscore diff --git a/paxo/requirements.txt b/paxo/requirements.txt index 4b36ac6..90934b7 100644 --- a/paxo/requirements.txt +++ b/paxo/requirements.txt @@ -3,4 +3,5 @@ python-levenshtein flask spotpy numpy -matplotlib \ No newline at end of file +matplotlib +pandas \ No newline at end of file diff --git a/paxo/validation.py b/paxo/validation.py index 90f5761..b80914e 100644 --- a/paxo/validation.py +++ b/paxo/validation.py @@ -3,9 +3,10 @@ import requests import time -url="https://www.ebi.ac.uk/ols/api/search" +#url="https://www.ebi.ac.uk/ols/api/search" +url="http://snarf.ebi.ac.uk:8980/ols-beta/api/search" -def validateFinaleScore(combinedOntologyName, stdNamed, inputFile, TargetFile, writeToDisc, params, parseParms): +def validateFinaleScore(onto1, onto2, stdNamed, inputFile, TargetFile, writeToDisc, params, parseParms): uri1Position=parseParms['uri1'] uri2Position=parseParms['uri2'] counterPosition=parseParms['scorePosition'] @@ -41,10 +42,6 @@ def validateFinaleScore(combinedOntologyName, stdNamed, inputFile, TargetFile, w for counter, line in enumerate(targetList): #NoMatch from std to the created mapping file, so this goes to the missing List if line not in inputList: - #if line[0]=="http://purl.obolibrary.org/obo/HP_0009059" or line[0]=="http://purl.obolibrary.org/obo/HP_0025247": - #print line - #print targetLongList[counter] - missing.append([line[0], line[1], "NoScore", targetLongList[counter][counterPosition]]) #Exact same Result for both, so this is a match. 
Is added to the matches List @@ -70,7 +67,6 @@ def validateFinaleScore(combinedOntologyName, stdNamed, inputFile, TargetFile, w for miss in missing: if sug[0]==miss[0] and sug[1]!=miss[1]: alternativeCounter=alternativeCounter+1 - #print sug[0] +" mapped to "+sug[1]+" and "+miss[1] unrealMiss.append(sug) unrealMiss.append(miss) #Real miss @@ -80,23 +76,15 @@ def validateFinaleScore(combinedOntologyName, stdNamed, inputFile, TargetFile, w result=matches+missing+alternatives#+discarted - we can also show the discarted terms or put them in an own file - - #print "unrealMiss" - #print unrealMiss - #print "Real Miss" - #print realMiss - #print "Result" - #print result - #If we write to disc, I get the labels of the parts that are NOT mapped to the standard if writeToDisc is True: print "Try to save the result" obsoleteScore=0 for row in result: #if row[2]=='NoScore' or row[3]=='noScore': - print "Need to annotate "+row[0]+" and "+row[1] + #print "Need to annotate "+row[0]+" and "+row[1] - data={'q':row[0],'queryFields':'iri', 'fieldList': 'label'} + data={'q':row[0],'queryFields':'iri', 'fieldList': 'label', "ontology":onto1, "type":"class", "local":True} try: r = requests.get(url, data) @@ -114,15 +102,16 @@ def validateFinaleScore(combinedOntologyName, stdNamed, inputFile, TargetFile, w logging.info(r.request.url) #raise - jsonReply=r.json() + try: + jsonReply=r.json() row.append(jsonReply['response']['docs'][0]['label'].encode(encoding='UTF-8')) except: row.append('NoLabel Found') obsoleteScore=obsoleteScore+1 print "No Label found in the first row" - data={'q':row[1],'queryFields':'iri', 'fieldList': 'label'} + data={'q':row[1],'queryFields':'iri', 'fieldList': 'label', "ontology":onto2, "type":"class", "local":True} try: r = requests.get(url, data) except: @@ -138,8 +127,9 @@ def validateFinaleScore(combinedOntologyName, stdNamed, inputFile, TargetFile, w logging.info(r.status_code) logging.info(r.request.url) - jsonReply=r.json() + try: + jsonReply=r.json() row.append(jsonReply['response']['docs'][0]['label'].encode(encoding='UTF-8')) except: row.append('NoLabel Found') @@ -147,31 +137,34 @@ def validateFinaleScore(combinedOntologyName, stdNamed, inputFile, TargetFile, w print "No Label found in the second row" - with open('pipeline_output/'+combinedOntologyName+'_'+stdNamed+'_validate.csv', 'wb') as f: + with open('pipeline_output/'+onto1+"_"+onto2+'_'+stdNamed+'_validate.csv', 'wb') as f: writer = csv.writer(f) writer.writerows(result) f.close() #Logging the stats of this validation - logging.info("ParameterSet for this validation run: ") - logging.info("threshold: "+str(params["threshold"])) - logging.info("fuzzyUpperLimit: "+str(params["fuzzyUpperLimit"])) - logging.info("fuzzyLowerLimit: "+str(params["fuzzyLowerLimit"])) - logging.info("fuzzyUpperFactor: "+str(params["fuzzyUpperFactor"])) - logging.info("fuzzyLowerFactor: "+str(params["fuzzyLowerFactor"])) - logging.info("oxoDistanceOne: "+str(params["oxoDistanceOne"])) - logging.info("oxoDistanceTwo: "+str(params["oxoDistanceTwo"])) - logging.info("oxoDistanceThree: "+str(params["oxoDistanceThree"])) - logging.info("synFuzzyFactor: "+str(params["synFuzzyFactor"])) - logging.info("synOxoFactor: "+str(params["synOxoFactor"])) - - logging.info("Stats for "+combinedOntologyName+" validation "+stdNamed) - logging.info("Number of std mappings:"+str(len(targetList))) - logging.info("Total Matches: "+str(len(matches))) - logging.info("Algorithm missed compared to std: "+str(len(missing))) - logging.info("Suspected Obsoleted Terms: 
"+str(obsoleteScore)) - logging.info("Total alternative terms suggested: "+str(len(alternatives))) - logging.info("AlternativeOverlappingWithMisses:"+str(alternativeCounter)+"\n") + logging.info("ParameterSet for this validation run , ") + logging.info("threshold , "+str(params["threshold"])) + logging.info("fuzzyUpperLimit , "+str(params["fuzzyUpperLimit"])) + logging.info("fuzzyLowerLimit , "+str(params["fuzzyLowerLimit"])) + logging.info("fuzzyUpperFactor , "+str(params["fuzzyUpperFactor"])) + logging.info("fuzzyLowerFactor , "+str(params["fuzzyLowerFactor"])) + logging.info("oxoDistanceOne , "+str(params["oxoDistanceOne"])) + logging.info("oxoDistanceTwo , "+str(params["oxoDistanceTwo"])) + logging.info("oxoDistanceThree , "+str(params["oxoDistanceThree"])) + logging.info("synFuzzyFactor , "+str(params["synFuzzyFactor"])) + logging.info("synOxoFactor , "+str(params["synOxoFactor"])) + + logging.info("Stats for "+str(onto1)+"_"+str(onto2)+" validation "+stdNamed) + logging.info("Number of std mappings ,"+str(len(targetList))) + logging.info("Total Matches ,"+str(len(matches))) + logging.info("Algorithm missed compared to std ,"+str(len(missing))) + logging.info("Suspected Obsoleted Terms ,"+str(obsoleteScore)) + logging.info("Algorithm missed compared to std MINUS obsoleted terms in std ,"+str(len(missing)-obsoleteScore)) + logging.info("Total unique terms suggested ,"+str(len(alternatives))) + logging.info("UniqueOverlappingWithMisses ,"+str(alternativeCounter)) + logging.info("Recall ,"+str((len(matches)/(len(targetList)-obsoleteScore*1.0))*100)+" in %\n") + #logging.info("NotMapped: "+str(len(discarted))+"\n") From 3a3b0cb2683475be94c276da1f9dc5280277a2e5 Mon Sep 17 00:00:00 2001 From: LLTommy Date: Mon, 18 Dec 2017 14:22:25 +0100 Subject: [PATCH 07/66] Remove unneeded url --- paxo/clientOperations.py | 2 +- paxo/validation.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paxo/clientOperations.py b/paxo/clientOperations.py index 2d818eb..b207ef5 100644 --- a/paxo/clientOperations.py +++ b/paxo/clientOperations.py @@ -9,7 +9,7 @@ import ast url="https://www.ebi.ac.uk/ols/api/" -#url="http://snarf.ebi.ac.uk:8980/ols-beta/api" + #Compares to ontologies from the OLS. This process can take a while and procudes a csv with primary results def scoreOntologies(sourceOntology, targetOntology, scoreParams): diff --git a/paxo/validation.py b/paxo/validation.py index b80914e..27631e6 100644 --- a/paxo/validation.py +++ b/paxo/validation.py @@ -4,7 +4,7 @@ import time #url="https://www.ebi.ac.uk/ols/api/search" -url="http://snarf.ebi.ac.uk:8980/ols-beta/api/search" + def validateFinaleScore(onto1, onto2, stdNamed, inputFile, TargetFile, writeToDisc, params, parseParms): uri1Position=parseParms['uri1'] From 209177b99c8fb06f9b41a642ca93e56393e6e577 Mon Sep 17 00:00:00 2001 From: LLTommy Date: Wed, 20 Dec 2017 12:06:19 +0100 Subject: [PATCH 08/66] Preparing the use of configs to drive the process --- paxo/clientOperations.py | 278 ++++++++++++++++++++++++--------------- paxo/flaskMapping.py | 17 +-- paxo/validation.py | 60 +++++---- 3 files changed, 216 insertions(+), 139 deletions(-) diff --git a/paxo/clientOperations.py b/paxo/clientOperations.py index b207ef5..9c226dc 100644 --- a/paxo/clientOperations.py +++ b/paxo/clientOperations.py @@ -8,17 +8,24 @@ from ConfigParser import SafeConfigParser import ast -url="https://www.ebi.ac.uk/ols/api/" - #Compares to ontologies from the OLS. 
This process can take a while and procudes a csv with primary results -def scoreOntologies(sourceOntology, targetOntology, scoreParams): +def scoreOntologies(sourceOntology, targetOntology, scoreParams, scoringtargetFolder): logging.info("Start scoring "+sourceOntology+" and "+targetOntology) #Check for the smaller ontology - r = requests.get(url+"ontologies/"+sourceOntology) - numberOfTerms=r.json()['numberOfTerms'] - r = requests.get(url+"ontologies/"+targetOntology) - numberOfTerms2 = r.json()['numberOfTerms'] + url=config.get("Basics","olsAPIURL") + + try: + r = requests.get(url+"ontologies/"+sourceOntology) + numberOfTerms=r.json()['numberOfTerms'] + r = requests.get(url+"ontologies/"+targetOntology) + numberOfTerms2 = r.json()['numberOfTerms'] + except: + logging.error("Error getting number of terms throw webservice call!") + logging.error(url+"ontologies/"+sourceOntology) + logging.error(url+"ontologies/"+targetOntology) + logging.error(r) + raise #In case the targetOntology is smaller than the source Ontology, switch the output if (numberOfTerms>numberOfTerms2): @@ -26,7 +33,7 @@ def scoreOntologies(sourceOntology, targetOntology, scoreParams): sourceOntology=targetOntology targetOntology=tmpOntology - termsUrl=url+"ontologies/"+sourceOntology+"/terms?size=100&fieldList=iri,label,synonym" + termsUrl=url+"ontologies/"+sourceOntology+"/terms?size=5&fieldList=iri,label,synonym" results=[] results.append(["sourceLabel","sourceIRI", "fuzzy", "oxo", "synFuzzy", "synOxo", "bridgeTerms"]) @@ -100,14 +107,16 @@ def scoreOntologies(sourceOntology, targetOntology, scoreParams): print "Reached last page I recon" break - with open('pipeline_output/scoring_output_'+sourceOntology+'_'+targetOntology+'.csv', 'w') as f: + + with open(scoringtargetFolder+'scoring_output_'+sourceOntology+'_'+targetOntology+'.csv', 'w') as f: writer = csv.writer(f) writer.writerows(results) f.close() #Read in and process the ontology primary score from a csv file -def scoreOntologyPrimaryScore(name): - with open("pipeline_output/"+name+"/scoring_output_"+name+".csv") as csvfile: +def scoreOntologyPrimaryScore(name, scorefolder): + + with open(scorefolder+"scoring_output_"+name+".csv") as csvfile: readCSV = csv.reader(csvfile, delimiter=',') scoreMatrix=[] @@ -176,23 +185,24 @@ def processOntologyPrimaryScore(pScore, params): return tmp #Maybe transfer to server -def scoreTermList(termList, targetOntology, params): +def scoreTermList(termList, targetOntology, scoreParams, params): result=[] for term in termList: - result.append(flaskMapping.scoreTermLabel(term, targetOntology, params)) + result.append(flaskMapping.scoreTermLabel(term, targetOntology, scoreParams, params)) return result # Process an IRI list via OLS instead of a termList # def scoreIriList(IriList, targetOntology, params): #Process scoredMatrix to prepare for validation or save to disc -def writeOutPutScore(scoredMatrix, name, saveToDisc): +def writeOutPutScore(scoredMatrix, name, predictedTargetFolder, saveToDisc): result=[] + for line in scoredMatrix: result.append([line[0]['sourceIRI'], line[0]['iri'], line[0]['finaleScore'], line[0]['sourceTerm']]) if saveToDisc==True: - with open('pipeline_output/calculated_output_'+name+'.csv', 'w') as f: + with open(predictedTargetFolder+'calculated_output_'+name+'.csv', 'w') as f: writer = csv.writer(f) writer.writerows(result) f.close() @@ -219,27 +229,27 @@ def curationOntologyFinalScore(scoredMatrix): #print scoredMatrix[counter] if unified[index][2] -175? 
-#Best run at 39 of 50 (best like=-175) with parameter set: -#[ 5.17002538e-05 9.97186358e-01 5.14350766e-03 5.08109418e-01 -# 9.98062574e-01 9.11663359e-02 4.59066160e-01 6.72066822e-01 -# 4.07275455e-01] + fuzzyUpperLimit=float(config.get(section,'fuzzyUpperLimit')) + fuzzyLowerLimit=float(config.get(section,'fuzzyLowerLimit')) + fuzzyUpperFactor=float(config.get(section,'fuzzyUpperFactor')) + fuzzyLowerFactor=float(config.get(section,'fuzzyLowerFactor')) + oxoDistanceOne=float(config.get(section,'oxoDistanceOne')) + oxoDistanceTwo=float(config.get(section,'oxoDistanceTwo')) + oxoDistanceThree=float(config.get(section,'oxoDistanceThree')) + synFuzzyFactor=float(config.get(section,'synFuzzyFactor')) + synOxoFactor=float(config.get(section,'synOxoFactor')) + bridgeOxoFactor=float(config.get(section,'bridgeOxoFactor')) + threshold=float(config.get(section,'threshold')) + params={"fuzzyUpperLimit": fuzzyUpperLimit, "fuzzyLowerLimit": fuzzyLowerLimit,"fuzzyUpperFactor": fuzzyUpperFactor,"fuzzyLowerFactor":fuzzyLowerFactor, "oxoDistanceOne":oxoDistanceOne, "oxoDistanceTwo":oxoDistanceTwo, "oxoDistanceThree":oxoDistanceThree, "synFuzzyFactor":synFuzzyFactor, "synOxoFactor": synOxoFactor, "bridgeOxoFactor":bridgeOxoFactor, "threshold":threshold} -#params={"fuzzyUpperLimit": 0.00005, "fuzzyLowerLimit": 0.99,"fuzzyUpperFactor": 0.5,"fuzzyLowerFactor":0.99, "oxoDistanceOne":0.09, "oxoDistanceTwo":0.459, "oxoDistanceThree":0.67, "synFuzzyFactor":0.41, "synOxoFactor": 0.38, "threshold":0.6} + print predictedTargetFolder + print "Calculate "+sourceOntology+" "+targetOntology + print scoringTargetFolder + return calculatePrimaryScore(sourceOntology+"_"+targetOntology, params, scoringTargetFolder, writeToDisc, predictedTargetFolder, curationOfDoubleEntries) -#params={"fuzzyUpperLimit": 0.8, "fuzzyLowerLimit": 0.6,"fuzzyUpperFactor": 1,"fuzzyLowerFactor":0.6, "oxoDistanceOne":1, "oxoDistanceTwo":0.3, "oxoDistanceThree":0.1, "synFuzzyFactor":1, "synOxoFactor": 0.4, "threshold":0.6} +config = SafeConfigParser() +config.read("config.ini") +logFile=config.get("Basics","logFile") +logging.basicConfig(filename=logFile, level=logging.INFO, format='%(asctime)s - %(message)s') -###Score all Ontologies in the config file -#scoreListOntologies(sections) -#config.get() -#oxoURL=config.get("replacements","oxoURL") -#print list(config.items('replacements')) +#writeToDiscFlag=config.get("Basics", ...) 
+writeToDiscFlag=False +uniqueMaps=True -removeStopwordsList=['of', 'the'] -replaceTermList=[('cancer', 'carcinom'), ('cancer', 'neoplasm'), ('cancer','carcinoma'),('abnormality','disease')] -scoreParams={"removeStopwordsList": removeStopwordsList, "replaceTermList" :replaceTermList} +sections=config.sections()[2:] -hp_doid_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : [('cancer', 'carcinom'), ('cancer', 'neoplasm'), ('cancer','carcinoma'),('abnormality','disease')]} +#Control via config file +#scoreListOntologies(sections) +#calculateListOntologies(sections, writeToDiscFlag, uniqueMaps) +#calculateAndValidateListOntologies(sections, writeToDiscFlag, uniqueMaps) +#removeStopwordsList=['of', 'the'] +#replaceTermList=[('cancer', 'carcinom'), ('cancer', 'neoplasm'), ('cancer','carcinoma'),('abnormality','disease')] +#scoreParams={"removeStopwordsList": removeStopwordsList, "replaceTermList" :replaceTermList} +#hp_doid_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : [('cancer', 'carcinom'), ('cancer', 'neoplasm'), ('cancer','carcinoma'),('abnormality','disease')]} ### Primary score ontologies -ordo_hp_scoreParams={"removeStopwordsList": ['of', 'the', 'Rare'], "replaceTermList" : [('cancer', 'carcinom'), ('cancer', 'neoplasm'), ('cancer','carcinoma'),('tumor', 'neoplasm'), ('tumor','cancer'), ('abnormality', 'disease'), ('decreased', 'reduced'), ('morphology', '')]} -scoreOntologies("ordo","hp", ordo_hp_scoreParams) - -doid_ordo_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : []} -scoreOntologies("doid","mp", doid_ordo_scoreParams) -# -doid_ordo_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : []} -scoreOntologies("doid","ordo", doid_ordo_scoreParams) -# -hp_doid_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : [('cancer', 'neoplasm'), ('cancer','carcinoma'), ('abnormality','disease'), 'abnormality','disease']} -scoreOntologies("hp","doid",hp_doid_scoreParams) -# -hp_mp_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : [('cancer', 'carcinom'), ('cancer', 'neoplasm'), ('cancer','carcinoma'),('abnormality','disease'), ('abnormal','Abnormality')]} -scoreOntologies("hp","mp", hp_mp_scoreParams) +#ordo_hp_scoreParams={"removeStopwordsList": ['of', 'the', 'Rare'], "replaceTermList" : [('cancer', 'carcinom'), ('cancer', 'neoplasm'), ('cancer','carcinoma'),('tumor', 'neoplasm'), ('tumor','cancer'), ('abnormality', 'disease'), ('decreased', 'reduced'), ('morphology', '')]} +#scoreOntologies("ordo","hp", ordo_hp_scoreParams, 'final_dec/scoring/') # -ordo_mp_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : []} -scoreOntologies("ordo","mp", ordo_mp_scoreParams) +#doid_mp_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : []} +#scoreOntologies("doid","mp", doid_mp_scoreParams, 'final_dec/scoring/') +# # +# doid_ordo_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : []} +# scoreOntologies("doid","ordo", doid_ordo_scoreParams, 'final_dec/scoring/') +# # +# hp_doid_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : [('cancer', 'neoplasm'), ('cancer','carcinoma'), ('abnormality','disease'), 'abnormality','disease']} +# scoreOntologies("hp","doid",hp_doid_scoreParams, 'final_dec/scoring/') +# # +# hp_mp_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : [('cancer', 'carcinom'), ('cancer', 'neoplasm'), ('cancer','carcinoma'),('abnormality','disease'), 
('abnormal','Abnormality')]} +# scoreOntologies("hp","mp", hp_mp_scoreParams, 'final_dec/scoring/') +# # +# ordo_mp_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : []} +# scoreOntologies("ordo","mp", ordo_mp_scoreParams, 'final_dec/scoring/') +#mesh_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : []} +#scoreOntologies("mesh","hp", mesh_scoreParams, 'final_dec/scoring/') +#scoreOntologies("mesh","doid", mesh_scoreParams, 'final_dec/scoring/') +#scoreOntologies("mesh","ordo", mesh_scoreParams, 'final_dec/scoring/') +#scoreOntologies("mesh","mp", mesh_scoreParams, 'final_dec/scoring/') -#scoreOntologies("mesh","hp") -#scoreOntologies("mesh","doid") -#scoreOntologies("mesh","ordo") -#scoreOntologies("mesh","mp") -writeToDisc=False -uniqueMaps=False +#Could/Should be changed so parameters come from the config file +params={"fuzzyUpperLimit": 0.8, "fuzzyLowerLimit": 0.6,"fuzzyUpperFactor": 1,"fuzzyLowerFactor":0.6, "oxoDistanceOne":1, "oxoDistanceTwo":0.3, "oxoDistanceThree":0.1, "synFuzzyFactor":0.6, "synOxoFactor": 0.4, "bridgeOxoFactor":1, "threshold":0.6} +#params={"fuzzyUpperLimit": 0.8, "fuzzyLowerLimit": 0.6,"fuzzyUpperFactor": 1,"fuzzyLowerFactor":0.6, "oxoDistanceOne":1, "oxoDistanceTwo":0.3, "oxoDistanceThree":0.1, "synFuzzyFactor":0.6, "synOxoFactor": 0.4, "bridgeOxoFactor":1, "threshold":0.8} + ### Execute Calculate and validate for a certain file -# print calculateAndValidateOntologyPrimaryScore('hp', 'doid', 'loom', 'Loom/DOID_HP_loom.csv', params, writeToDisc, {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps) -# print calculateAndValidateOntologyPrimaryScore('hp','doid', 'silver','silver_nov/Consensus-3-hp-doid.tsv', params, writeToDisc, {'uri1':0, 'uri2':2, 'scorePosition':4 , 'delimiter':'\t'}, uniqueMaps) -# print calculateAndValidateOntologyPrimaryScore('ordo', 'hp', 'loom', 'Loom/ordo_hp_loom.csv', params, writeToDisc, {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps) -# print calculateAndValidateOntologyPrimaryScore('ordo', 'hp', 'silver','silver_nov/Consensus-3-hp-ordo.tsv', params, writeToDisc, {'uri1':2, 'uri2':0, 'scorePosition':4 , 'delimiter':'\t'}, uniqueMaps) +#print calculateAndValidateOntologyPrimaryScore('hp', 'doid', 'loom', 'Loom/DOID_HP_loom.csv', params, 'final_dec/scoring/', writeToDiscFlag, 'final_dec/predicted/', {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps, 'final_dec/validation/') +#print calculateAndValidateOntologyPrimaryScore('hp','doid', 'silver','silver_nov/Consensus-3-hp-doid.tsv', params, 'final_dec/scoring/', writeToDiscFlag, 'final_dec/predicted/',{'uri1':0, 'uri2':2, 'scorePosition':4 , 'delimiter':'\t'}, uniqueMaps, 'final_dec/validation/') +#print calculateAndValidateOntologyPrimaryScore('ordo', 'hp', 'loom', 'Loom/ordo_hp_loom.csv', params,'final_dec/scoring/', writeToDisc, final_dec/predicted/', {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps, 'final_dec/validation/') + + +#params={"fuzzyUpperLimit": 0, "fuzzyLowerLimit": 0,"fuzzyUpperFactor": 0.65, "fuzzyLowerFactor":0, "oxoDistanceOne":0.00029, "oxoDistanceTwo":0.57, "oxoDistanceThree":0.027, "synFuzzyFactor":0.247, "synOxoFactor": 0.62, "bridgeOxoFactor":0.829, "threshold":0.6} + +print calculateAndValidateOntologyPrimaryScore('ordo', 'hp', 'silver','silver_nov/Consensus-3-hp-ordo.tsv', params,'final_dec/scoring/', writeToDiscFlag, 'final_dec/predicted/',{'uri1':2, 'uri2':0, 'scorePosition':4 , 'delimiter':'\t'}, uniqueMaps, 'final_dec/validation/') +# 
{'misses': 210, 'alternatives': 350}
#
#
# print calculateAndValidateOntologyPrimaryScore('mp','hp', 'loom','Loom/MP_HP_loom.csv', params,'final_dec/scoring/', writeToDiscFlag, 'final_dec/predicted/', {'uri1':0, 'uri2':1, 'scorePosition':2 , 'delimiter':','}, uniqueMaps, 'final_dec/validation/')
print calculateAndValidateOntologyPrimaryScore('mp','hp', 'silver','silver_nov/Consensus-3-hp-mp.tsv', params, 'final_dec/scoring/',writeToDiscFlag, 'final_dec/predicted/', {'uri1':2, 'uri2':0, 'scorePosition':4 , 'delimiter':'\t'}, uniqueMaps, 'final_dec/validation/')
# print calculateAndValidateOntologyPrimaryScore('ordo','doid', 'loom' ,'Loom/DOID_ORDO_loom.csv', params, 'final_dec/scoring/',writeToDiscFlag, 'final_dec/predicted/', {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps, 'final_dec/validation/')
print calculateAndValidateOntologyPrimaryScore('ordo','doid', 'silver','silver_nov/Consensus-3-doid-ordo.tsv', params, 'final_dec/scoring/', writeToDiscFlag,'final_dec/predicted/', {'uri1':2, 'uri2':0, 'scorePosition':4 , 'delimiter':'\t'}, uniqueMaps, 'final_dec/validation/')
# print calculateAndValidateOntologyPrimaryScore('ordo','mp', 'loom', 'Loom/mp_ordo_loom.csv', params,'final_dec/scoring/', writeToDiscFlag, 'final_dec/predicted/', {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps, 'final_dec/validation/')
print calculateAndValidateOntologyPrimaryScore('ordo','mp', 'silver','silver_nov/Consensus-3-mp-ordo.tsv', params,'final_dec/scoring/', writeToDiscFlag, 'final_dec/predicted/', {'uri1':2, 'uri2':0, 'scorePosition':4 , 'delimiter':'\t'}, uniqueMaps, 'final_dec/validation/')
#print calculateAndValidateOntologyPrimaryScore('mp','doid', 'loom', 'Loom/DOID_MP_loom.csv', params, 'final_dec/scoring/',writeToDiscFlag, 'final_dec/predicted/', {'uri1':1, 'uri2':0, 'scorePosition':2, 'delimiter':','}, uniqueMaps, 'final_dec/validation/')
print calculateAndValidateOntologyPrimaryScore('mp','doid', 
'silver','silver_nov/Consensus-3-mp-doid.tsv', params, 'final_dec/scoring/',writeToDiscFlag,'final_dec/predicted/', {'uri1':0, 'uri2':2, 'scorePosition':4 , 'delimiter':'\t'}, uniqueMaps, 'final_dec/validation/') # # # -# print calculateAndValidateOntologyPrimaryScore('mesh','doid', 'loom', 'Loom/DOID_MESH_loom_new.csv', params, writeToDisc, {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps) -# print calculateAndValidateOntologyPrimaryScore('mesh','doid', 'silver', 'silver_nov/Consensus-3-doid-mesh3.tsv', params, writeToDisc, {'uri1':2, 'uri2':0, 'scorePosition':2, 'delimiter':'\t'}, uniqueMaps) -# print calculateAndValidateOntologyPrimaryScore('mesh','hp', 'loom', 'Loom/mesh_hp_loom_new.csv', params, writeToDisc, {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps) -# print calculateAndValidateOntologyPrimaryScore('mesh','doid', 'silver', 'silver_nov/Consensus-3-doid-mesh3.tsv', params, writeToDisc, {'uri1':1, 'uri2':0, 'scorePosition':2, 'delimiter':'\t'}, uniqueMaps) -# print calculateAndValidateOntologyPrimaryScore('mesh','mp', 'loom', 'Loom/mesh_mp_loom_new.csv', params, writeToDisc, {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps) -# print calculateAndValidateOntologyPrimaryScore('mesh','doid', 'silver', 'silver_nov/Consensus-3-doid-mesh3.tsv', params, writeToDisc, {'uri1':1, 'uri2':0, 'scorePosition':2, 'delimiter':'\t'}, uniqueMaps) -###Execute functions for terms (#Broken since last change?) -# --> needs to be invesitgate! -#print flaskMapping.scoreTermLabel("Nuclear cataract", "doid", params) +# print calculateAndValidateOntologyPrimaryScore('mesh','doid', 'loom', 'Loom/DOID_MESH_loom_new.csv', params, 'final_dec/scoring/',writeToDisc,'final_dec/predicted/', {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps, 'final_dec/validation/') +# print calculateAndValidateOntologyPrimaryScore('mesh','doid', 'silver', 'silver_nov/Consensus-3-doid-mesh3.tsv', 'final_dec/scoring/',params, writeToDisc, 'final_dec/predicted/', {'uri1':2, 'uri2':0, 'scorePosition':2, 'delimiter':'\t'}, uniqueMaps, 'final_dec/validation/') +# print calculateAndValidateOntologyPrimaryScore('mesh','hp', 'loom', 'Loom/mesh_hp_loom_new.csv', params, 'final_dec/scoring/',writeToDisc,'final_dec/predicted/', {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps, 'final_dec/validation/') +# print calculateAndValidateOntologyPrimaryScore('mesh','hp', 'silver', 'silver_nov/Consensus-3-hp-mesh3.tsv', params, 'final_dec/scoring/',writeToDisc,'final_dec/predicted/', {'uri1':1, 'uri2':0, 'scorePosition':2, 'delimiter':'\t'}, uniqueMaps, 'final_dec/validation/') +# print calculateAndValidateOntologyPrimaryScore('mesh','mp', 'loom', 'Loom/mesh_mp_loom_new.csv', params, 'final_dec/scoring/',writeToDisc,'final_dec/predicted/', {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps, 'final_dec/validation/') +# print calculateAndValidateOntologyPrimaryScore('mesh','mp', 'silver', 'silver_nov/Consensus-3-mp-mesh3.tsv', params,'final_dec/scoring/', writeToDisc, 'final_dec/predicted/', {'uri1':1, 'uri2':0, 'scorePosition':2, 'delimiter':'\t'}, uniqueMaps, 'final_dec/validation/') -###Calculate the score from the primary file and run it against the validation files. For all Ontologies in the config file -#calculateAndValidateListOntologies(sections) + +###Execute functions for terms (#Broken since last change?) 
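# Sketch of the config-driven route this script is moving toward (see the
# "#Control via config file" block near the top): one config.ini section per
# ontology pair, holding the same parameter keys the code already reads.
# The section layout is assumed here, not prescribed by the patch.
from ConfigParser import SafeConfigParser  # Python 2, as in this module

config = SafeConfigParser()
config.read("config.ini")
for section in config.sections()[2:]:  # skip the leading non-pair sections, as above
    params = dict((key, config.getfloat(section, key))
                  for key in ("fuzzyUpperLimit", "fuzzyLowerLimit",
                              "fuzzyUpperFactor", "fuzzyLowerFactor",
                              "oxoDistanceOne", "oxoDistanceTwo",
                              "oxoDistanceThree", "synFuzzyFactor",
                              "synOxoFactor", "bridgeOxoFactor", "threshold"))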
+#scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : []} +#print flaskMapping.scoreTermLabel("Nuclear cataract", "doid", scoreParams, params) diff --git a/paxo/flaskMapping.py b/paxo/flaskMapping.py index 8d8f2e1..8034d12 100644 --- a/paxo/flaskMapping.py +++ b/paxo/flaskMapping.py @@ -16,9 +16,9 @@ searchURL=config.get("Basics","olsURL") oxoURL=config.get("Basics","oxoURL") +logFile=config.get("Basics","logFile") -logging.basicConfig(filename="flask.log", level=logging.INFO, format='%(asctime)s - %(message)s') - +logging.basicConfig(filename=logFile, level=logging.INFO, format='%(asctime)s - %(message)s') @app.route("/") @@ -401,12 +401,6 @@ def simplifyProcessedPscore(mapping): #print "made it to the end of simplifyProcessedPscore" - # Should do this bridgy thing here - #if mapping['bridgeEvidence']==None - # - # - # - # return scoreMatrix #Simple Score mechanism for all subscores, returns a sorted list. Is Called after simplifyProcessedPscore @@ -427,6 +421,7 @@ def scoreSimple(scoreMatrix, params): synFuzzyFactor=params['synFuzzyFactor'] synOxoFactor=params['synOxoFactor'] + bridgeOxoFactor=params['bridgeOxoFactor'] resultMatrix=[] for i,score in enumerate(scoreMatrix): @@ -454,7 +449,7 @@ def scoreSimple(scoreMatrix, params): print scoreMatrix[i] - score['finaleScore']=score['fuzzyScore']*fFactor+score['oxoScore']+score['synFuzzy']*synFuzzyFactor+score['synOxo']*synOxoFactor+score['bridgeOxoScore'] + score['finaleScore']=score['fuzzyScore']*fFactor+score['oxoScore']+score['synFuzzy']*synFuzzyFactor+score['synOxo']*synOxoFactor+score['bridgeOxoScore']*bridgeOxoFactor ### Do we want unknown to be printed if score['finaleScore']>threshold: #This removes "unknow" from the results and weak results @@ -471,8 +466,8 @@ def scoreSimple(scoreMatrix, params): #def scoreComplex(scoreMatrix): #Calls all necessary steps to get a result for a termLabel -def scoreTermLabel(termLabel, targetOntology, params): - pscore=primaryScoreTerm('', termLabel, targetOntology, params) #Executes the basic calls to OLS and OXO, delievers primary score +def scoreTermLabel(termLabel, targetOntology, scoreParams, params): + pscore=primaryScoreTerm('', termLabel, targetOntology, scoreParams) #Executes the basic calls to OLS and OXO, delievers primary score pscore['sourceIRI']="UNKNOWN" calculatedMappings=processPScore(pscore) #Process the primaryScore, weighting the primary results calculatedMappings['sourceIRI']="UNKNOWN" diff --git a/paxo/validation.py b/paxo/validation.py index 27631e6..def2d79 100644 --- a/paxo/validation.py +++ b/paxo/validation.py @@ -2,16 +2,28 @@ import logging import requests import time - +from ConfigParser import SafeConfigParser #url="https://www.ebi.ac.uk/ols/api/search" +#url="http://snarf.ebi.ac.uk:8980/ols-beta/api/search" +config = SafeConfigParser() +config.read("config.ini") -def validateFinaleScore(onto1, onto2, stdNamed, inputFile, TargetFile, writeToDisc, params, parseParms): +def validateFinaleScore(onto1, onto2, stdNamed, inputFile, TargetFile, writeToDisc, params, parseParms, validationTargetFolder): uri1Position=parseParms['uri1'] uri2Position=parseParms['uri2'] counterPosition=parseParms['scorePosition'] delimiterChar=parseParms['delimiter'] + url=config.get("Basics","olsURL") + + print inputFile + print TargetFile + print validationTargetFolder + print uri1Position + print uri2Position + + logging.basicConfig(filename="flask.log", level=logging.INFO, format='%(asctime)s - %(message)s') inputList=[] @@ -65,7 +77,7 @@ def validateFinaleScore(onto1, onto2, 
stdNamed, inputFile, TargetFile, writeToDi realMiss=[] for sug in alternatives: for miss in missing: - if sug[0]==miss[0] and sug[1]!=miss[1]: + if (sug[0]==miss[0] and sug[1]!=miss[1]) or (sug[0]!=miss[0] and sug[1]==miss[1]): alternativeCounter=alternativeCounter+1 unrealMiss.append(sug) unrealMiss.append(miss) @@ -137,33 +149,33 @@ def validateFinaleScore(onto1, onto2, stdNamed, inputFile, TargetFile, writeToDi print "No Label found in the second row" - with open('pipeline_output/'+onto1+"_"+onto2+'_'+stdNamed+'_validate.csv', 'wb') as f: + with open(validationTargetFolder+onto1+"_"+onto2+'_'+stdNamed+'_validate.csv', 'wb') as f: writer = csv.writer(f) writer.writerows(result) f.close() #Logging the stats of this validation - logging.info("ParameterSet for this validation run , ") - logging.info("threshold , "+str(params["threshold"])) - logging.info("fuzzyUpperLimit , "+str(params["fuzzyUpperLimit"])) - logging.info("fuzzyLowerLimit , "+str(params["fuzzyLowerLimit"])) - logging.info("fuzzyUpperFactor , "+str(params["fuzzyUpperFactor"])) - logging.info("fuzzyLowerFactor , "+str(params["fuzzyLowerFactor"])) - logging.info("oxoDistanceOne , "+str(params["oxoDistanceOne"])) - logging.info("oxoDistanceTwo , "+str(params["oxoDistanceTwo"])) - logging.info("oxoDistanceThree , "+str(params["oxoDistanceThree"])) - logging.info("synFuzzyFactor , "+str(params["synFuzzyFactor"])) - logging.info("synOxoFactor , "+str(params["synOxoFactor"])) + logging.info("ParameterSet for this validation run, ") + logging.info("threshold, "+str(params["threshold"])) + logging.info("fuzzyUpperLimit, "+str(params["fuzzyUpperLimit"])) + logging.info("fuzzyLowerLimit, "+str(params["fuzzyLowerLimit"])) + logging.info("fuzzyUpperFactor, "+str(params["fuzzyUpperFactor"])) + logging.info("fuzzyLowerFactor, "+str(params["fuzzyLowerFactor"])) + logging.info("oxoDistanceOne, "+str(params["oxoDistanceOne"])) + logging.info("oxoDistanceTwo, "+str(params["oxoDistanceTwo"])) + logging.info("oxoDistanceThree, "+str(params["oxoDistanceThree"])) + logging.info("synFuzzyFactor, "+str(params["synFuzzyFactor"])) + logging.info("synOxoFactor, "+str(params["synOxoFactor"])) logging.info("Stats for "+str(onto1)+"_"+str(onto2)+" validation "+stdNamed) - logging.info("Number of std mappings ,"+str(len(targetList))) - logging.info("Total Matches ,"+str(len(matches))) - logging.info("Algorithm missed compared to std ,"+str(len(missing))) - logging.info("Suspected Obsoleted Terms ,"+str(obsoleteScore)) - logging.info("Algorithm missed compared to std MINUS obsoleted terms in std ,"+str(len(missing)-obsoleteScore)) - logging.info("Total unique terms suggested ,"+str(len(alternatives))) - logging.info("UniqueOverlappingWithMisses ,"+str(alternativeCounter)) - logging.info("Recall ,"+str((len(matches)/(len(targetList)-obsoleteScore*1.0))*100)+" in %\n") + logging.info("Number of std mappings, "+str(len(targetList))) + logging.info("Total Matches, "+str(len(matches))) + logging.info("Algorithm missed compared to std, "+str(len(missing))) + logging.info("Suspected Obsoleted Terms, "+str(obsoleteScore)) + logging.info("Algorithm missed compared to std MINUS obsoleted terms in std, "+str(len(missing)-obsoleteScore)) + logging.info("Total unique terms suggested, "+str(len(alternatives))) + logging.info("UniqueOverlappingWithMisses, "+str(alternativeCounter)) + logging.info("Recall, "+str((len(matches)/(len(targetList)-obsoleteScore*1.0))*100)+" in %\n") #logging.info("NotMapped: "+str(len(discarted))+"\n") @@ -171,4 +183,4 @@ def 
validateFinaleScore(onto1, onto2, stdNamed, inputFile, TargetFile, writeToDi #return result return {"misses": len(missing), "alternatives": len(alternatives)} #Return the parameters and the result of the validation. This shall be useful in the future - #return {"misses": len(missing), "alternativeCounter": alternativeCounter ,"params":runParams} + #return {"misses": len(missing), "alternativeCounter": alternativeCounter, "params":runParams} From bd809e3ac7b381f60dbbe16e3107ab4336c3b036 Mon Sep 17 00:00:00 2001 From: LLTommy Date: Wed, 10 Jan 2018 16:51:28 +0000 Subject: [PATCH 09/66] Restructuring the project, folders, including extensive config file use, including oxo term loader --- dataloading/oxo/MappingLoader.py | 441 ++++++++++++++++++ dataloading/oxo/OlsDatasetLoader.py | 136 ++++++ dataloading/oxo/OxoClient.py | 173 +++++++ .../paxo}/clientOperations.py | 280 ++++++----- {paxo => dataloading/paxo}/flaskMapping.py | 101 ++-- dataloading/paxo/neoExporter.py | 165 +++++++ {paxo => dataloading/paxo}/readme.md | 0 {paxo => dataloading/paxo}/requirements.txt | 3 +- {paxo => dataloading/paxo}/validation.py | 42 +- 9 files changed, 1157 insertions(+), 184 deletions(-) create mode 100644 dataloading/oxo/MappingLoader.py create mode 100644 dataloading/oxo/OlsDatasetLoader.py create mode 100644 dataloading/oxo/OxoClient.py rename {paxo => dataloading/paxo}/clientOperations.py (56%) rename {paxo => dataloading/paxo}/flaskMapping.py (87%) create mode 100644 dataloading/paxo/neoExporter.py rename {paxo => dataloading/paxo}/readme.md (100%) rename {paxo => dataloading/paxo}/requirements.txt (74%) rename {paxo => dataloading/paxo}/validation.py (83%) diff --git a/dataloading/oxo/MappingLoader.py b/dataloading/oxo/MappingLoader.py new file mode 100644 index 0000000..afda731 --- /dev/null +++ b/dataloading/oxo/MappingLoader.py @@ -0,0 +1,441 @@ +import MySQLdb +import OxoClient as OXO +from pip._vendor.requests.packages.urllib3.connection import port_by_scheme +import urllib +import json +import xml.etree.ElementTree as ET +import yaml +import csv +import sys +import datetime +from neo4j.v1 import GraphDatabase, basic_auth +from ConfigParser import SafeConfigParser + + +config = SafeConfigParser() +config.read("../config/oxo_dataRelease_config.ini") + +OXO.oxoUrl=config.get("Basics","oxoUrl") +OXO.apikey=config.get("Basics","oxoAPIkey") +#OXO.olsurl=config.get("Basics","olsurl") +olsurl=config.get("Basics","olsurl") + +solrBaseUrl=config.get("Basics","solrBaseUrl") +getEfoAnnotationsUrl = solrBaseUrl+"/ontology/select?q=*%3A*&fq=ontology_name%3Aefo&rows=0&wt=csv&indent=true" +efoSolrQueryUrl = solrBaseUrl+"/ontology/select?fq=ontology_name%3Aefo&q=*&wt=json" +olsDbxerfSolrQuery = solrBaseUrl+"/ontology/select?q=hasDbXref_annotation%3A*+OR%0Adatabase_cross_reference_annotation%3A*+OR%0Ahas_alternative_id_annotation%3A*+OR%0Adefinition_citation_annotation%3A*&fl=iri%2Contology_name%2Clabel%2Cshort_form%2Cobo_id%2Cdatabase_cross_reference_annotation%2ChasDbXref_annotation%2C+definition_citation_annotation%2C+has_alternative_id_annotation+&wt=json&fq=!ontology_name%3Ancbitaxon&fq=!ontology_name%3Apr&fq=!ontology_name%3Avto&fq=!ontology_name%3Aogg" + +solrChunks=config.getint("Basics","solrChunks") +uri=config.get("Basics","neoURL") + +exportFileTerms=config.get("Paths","exportFileTerms") +exportFileMappings=config.get("Paths","exportFileMappings") + +user=config.get("SQLumls","user") +password=config.get("SQLumls","password") +host=config.get("SQLumls","host") +sqldb=config.get("SQLumls","db") 
+port=config.getint("SQLumls","port") + + +driver = GraphDatabase.driver(uri, auth=basic_auth("neo4j", "dba")) +session = driver.session() +print "neo success no sql" +db = MySQLdb.connect(user=user, passwd=password, + host=host, + db=sqldb, port=port) + + +# OLS loader +# get prefix data from OLS +prefixToPreferred = {} +termToIri = {} +termToLabel = {} +idorgNamespace = {} +prefixToDatasource = {} + +print "Reading datasources from OxO..." +for data in OXO.getOxODatasets(): + del data['_links'] + del data['description'] + prefix = data["prefix"] + prefixToDatasource[prefix] = data + prefixToPreferred[prefix] = prefix + for altPrefix in data["alternatePrefix"]: + prefixToPreferred[altPrefix] = prefix + if "idorgNamespace" in data: + idorgNamespace[altPrefix.lower()] = data["idorgNamespace"] + idorgNamespace[prefix.lower()] = data["idorgNamespace"] +# get total number of results + +knownAnnotations = [ + "database_cross_reference_annotation", + "hasDbXref_annotation" +] + +print "Reading datasources from OxO done" +# hack to get EFO xref annotations + +response = urllib.urlopen(getEfoAnnotationsUrl) +cr = csv.reader(response) + +for row in cr: + for p in row: + if 'definition_citation' in p: + knownAnnotations.append(p) + + + +unknownSource = {} + +terms = {} +mappings = {} +postMappings = [] + +def processSolrDocs(url): + rows = solrChunks + initUrl = url + "&start=0&rows=" + str(rows) + reply = urllib.urlopen(initUrl) + anwser = json.load(reply) + + size = anwser["response"]["numFound"] + + + for x in range(rows, size, rows): + for docs in anwser["response"]["docs"]: + fromPrefix = None + fromId = None + + fromIri = docs["iri"] + fromShortForm = docs["short_form"] + fromOntology = docs["ontology_name"] + fromLabel = docs["label"] + + if "obo_id" in docs: + fromOboId = docs["obo_id"] + fromPrefix = OXO.getPrefixFromCui(fromOboId) + fromId = OXO.getIdFromCui(fromOboId) + + #if fromPrefix=="orphanet": + # than use OLS API to check if it is an exact Match + + if not fromPrefix and not fromId: + fromPrefix = OXO.getPrefixFromCui(fromShortForm) + fromId = OXO.getIdFromCui(fromShortForm) + + if not fromPrefix: + print "Can't determine prefix for " + fromShortForm + " so skipping" + continue + + if not fromId: + print "Can't determine id for " + fromShortForm + " so skipping" + continue + # do we know the source term from the prefix? 
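        # e.g. an obo_id "DOID:162" splits into prefix "DOID" and id "162";
        # the prefix must resolve through prefixToPreferred (built from the
        # OxO datasources above) or the document is skipped below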
+ + if fromPrefix not in prefixToPreferred: + print "unknown prefix " + fromPrefix + " so skipping" + continue + + fromPrefix = prefixToPreferred[fromPrefix] + fromCurie = fromPrefix + ":" + fromId + + if fromCurie not in terms: + terms[fromCurie] = { + "prefix": fromPrefix, + "id": fromId, + "curie": fromCurie, + "uri": fromIri, + "label": fromLabel + } + else: + terms[fromCurie]["uri"] = fromIri + terms[fromCurie]["label"] = fromLabel + + for anno in knownAnnotations: + if anno in docs: + for xref in docs[anno]: + if ":" in xref or "_" in xref: + toPrefix = OXO.getPrefixFromCui(xref) + toId = OXO.getIdFromCui(xref) + + if not toPrefix or not toId: + print "Can't get prefix or id for " + xref.encode('utf-8') + continue + + if not toPrefix: + print "Can't extract prefix for " + xref.encode('utf-8') + continue + if toPrefix.lower() not in prefixToPreferred: + unknownSource[toPrefix] = 1 + # print "Unknown prefix source for "+toPrefix+" so skipping" + continue + + + toPrefix = prefixToPreferred[toPrefix.lower()] + toCurie = toPrefix + ":" + toId + + if toCurie not in terms: + terms[toCurie] = { + "prefix": toPrefix, + "id": toId, + "curie": toCurie, + "uri": None, + "label":None + } + + if fromCurie == toCurie: + continue + + + if fromOntology not in prefixToPreferred: + print "mapping from unknown source " + fromOntology + continue + mapping = { + "fromId": fromCurie, + "toId": toCurie, + "datasourcePrefix": prefixToPreferred[fromOntology], + "sourceType": "ONTOLOGY", + "scope": "RELATED" + } + + postMappings.append(mapping) + + # if fromCurie not in termToIri: + # termToIri[fromCurie] = None + # if fromCurie not in termToLabel: + # termToLabel[fromCurie] = None + # if toCurie not in termToIri: + # termToIri[toCurie] = None + # if toCurie not in termToLabel: + # termToLabel[toCurie] = None + + # if to id is idorg, then mint the Uri + if idorgNamespace[toPrefix.lower()] is not None: + idorgUri = "http://identifiers.org/" + idorgNamespace[toPrefix.lower()] + "/" + toId + terms[toCurie]["uri"] = idorgUri + + print str(x) + # OXO.saveMappings(postMappings) + # postMappings = [] + initUrl = url + "&start=" + str(x) + "&rows=" + str(rows) + reply = urllib.urlopen(initUrl) + anwser = json.load(reply) + + +# do the query to get docs from solr and process + +processSolrDocs(efoSolrQueryUrl) + +print "done processing EFO, starting to query OLS" +processSolrDocs(olsDbxerfSolrQuery) +print "done processing OLS" + + +# url = "http://www.ebi.ac.uk/ols/api/search?q=*&fieldList=iri,short_form,obo_id,database_cross_reference_annotation" +# print "Updating term labels" +# # update URIs and labels for any terms we have seen +# for id in termToIri: +# if id not in termToIri and id not in termToLabel: +# print "Can't determine iri or label for "+id +# else: +# print "hello" +# OXO.updateTerm(id, termToIri[id], termToLabel[id]) + + +# dump out the list of unkonw sources +print "Finished, here are all the unknown sources" +for key, value in unknownSource.iteritems() : + + # see if we can match prefix to db + print key.encode('utf-8', 'ignore') + +# umls loader + + +cur = db.cursor() + +# Use all the SQL you like +cur.execute("select distinct cui,sab, scui, sdui, str from MRCONSO where stt = 'PF' and ts = 'P' and sab != 'src'") + +# print all the first cell of all the rows +idToLabel = {} +def getUMLSMappingFromRow(row): + cui = row[0] + source = row[1] + targetUi = row[2] + descId = row[3] + label = row[4] + + + toid = targetUi + if descId is not None: + toid = descId + + if toid is None: + return None + + if 
source == "HPO": + source = OXO.getPrefixFromCui(toid) + toid = OXO.getIdFromCui(toid) + + fromCurie = "UMLS:" + cui + + toCurie = prefixToPreferred[source] + ":" + toid + if fromCurie not in terms: + terms[fromCurie] = { + "prefix": "UMLS", + "id": cui, + "curie": fromCurie, + "uri": "http://identifiers.org/umls/"+cui, + "label": label + } + else: + terms[fromCurie]["label"] = label + + if toCurie not in terms: + terms[toCurie] = { + "prefix": prefixToPreferred[source], + "id": toid, + "curie": toCurie, + "label": label, + "uri": None + } + else: + terms[toCurie]["label"] = label + + if idorgNamespace[source.lower()]: + terms[toCurie]["uri"] = "http://identifiers.org/"+idorgNamespace[source.lower()]+"/"+toid + + mapping = { + "fromId": fromCurie, + "toId": toCurie, + "datasourcePrefix": "UMLS", + "sourceType": "DATABASE", + "scope": "RELATED" + } + # idToLabel[source+":"+toid] = label + return mapping + + +for row in cur.fetchall(): + try: + mappingRow = getUMLSMappingFromRow(row) + if mappingRow is not None: + postMappings.append(mappingRow) + except Exception as e: + print e + print "Experienced a problem with " + print row + print "Catched it and try to move on" + +db.close() + + + +print "Looking for OLS terms with no labels..." +for key, term in terms.iteritems(): + + if key == "VO:0000740": + print "aha" + if term["label"] is None: + prefix = OXO.getPrefixFromCui(key) + if prefixToDatasource[prefixToPreferred[prefix]]["source"] == "ONTOLOGY": + object = OXO.getIriAndLabelFromOls(term["curie"], olsurl) + if object is not None: + if term["uri"]: + terms[key]["uri"] = object["uri"] + if term["label"]: + terms[key]["label"] = object["label"] + + + + +print "Generating CSV files for neo loading..." + + +with open(exportFileTerms, 'w') as csvfile: + spamwriter = csv.writer(csvfile, delimiter=',', + quoting=csv.QUOTE_ALL, escapechar='\\',doublequote=False) + spamwriter.writerow(['identifier', "curie", "label","uri", "prefix" ]) + for key, term in terms.iteritems(): + label = None + uri = None + + try: + if term["label"] is not None: + label = term["label"].encode('utf-8', errors="ignore") + except: + pass + + if term["uri"] is not None: + uri = term["uri"] + + spamwriter.writerow( [term["id"], term["curie"], label, uri, term["prefix"] ]) + +with open(exportFileMappings, 'w') as csvfile: + spamwriter = csv.writer(csvfile, delimiter=',', + quoting=csv.QUOTE_ALL, escapechar='\\',doublequote=False) + spamwriter.writerow(['fromCurie', "toCurie","datasourcePrefix","datasource","sourceType","scope","date" ]) + for mapping in postMappings: + datasource = prefixToDatasource[mapping["datasourcePrefix"]] + spamwriter.writerow( [mapping["fromId"],mapping["toId"],mapping["datasourcePrefix"],json.dumps(datasource),mapping["sourceType"],mapping["scope"], datetime.datetime.now().strftime("%y-%m-%d")]) + +print "Generating CSV files for neo loading done, now loading them..." + +# CREATE CONSTRAINT ON (i:Term) ASSERT i.curie IS UNIQUE +# CREATE CONSTRAINT ON (i:Datasource) ASSERT i.prefix IS UNIQUE + + +def deleteMappings(): + result = session.run("match (t)-[m:MAPPING]->() WITH m LIMIT 50000 DETACH DELETE m RETURN count(*) as count") + for record in result: + return record["count"] +print "Deleting mappings..." +while deleteMappings() > 0: + print "Still deleting..." +print "Mappings deleted!" 
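# The batched-delete pattern above (and reused twice below) keeps each Neo4j
# transaction bounded at 50,000 deletions. As a generic helper it could look
# like this sketch (illustrative, not code from this loader); it assumes a
# Cypher query that LIMITs its matches and RETURNs count(*) as count:
def deleteInBatches(session, cypher):
    while True:
        record = session.run(cypher).single()
        if record is None or record["count"] == 0:
            break

# deleteInBatches(session, "match (t)-[m:MAPPING]->() WITH m LIMIT 50000 "
#                          "DETACH DELETE m RETURN count(*) as count")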
+ +print "Deleting previous has_source" +def deleteSourceRels(): + result = session.run("match (t)-[m:HAS_SOURCE]->() WITH m LIMIT 50000 DETACH DELETE m RETURN count(*) as count") + for record in result: + return record["count"] +while deleteSourceRels() > 0: + print "Still deleting..." +print "Source rels deleted!" + +print "Deleting previous terms" +def deleteTerms(): + result = session.run("match (t:Term) WITH t LIMIT 50000 DETACH DELETE t RETURN count(*) as count") + for record in result: + return record["count"] +while deleteTerms() > 0: + print "Still deleting..." +print "Terms deleted!" + +print "Loading terms.csv..." + + + +loadTermsCypher = "USING PERIODIC COMMIT 10000 LOAD CSV WITH HEADERS FROM 'file:///"+exportFileTerms+"""' AS line + MATCH (d:Datasource {prefix : line.prefix}) + WITH d, line + MERGE (t:Term { id: line.identifier, curie: line.curie, label: line.label, uri: line.uri}) + with t,d + CREATE (t)-[:HAS_SOURCE]->(d)""" +result = session.run(loadTermsCypher) +print result.summary() + +print "Loading mappings.csv..." +loadMappingsCypher = "USING PERIODIC COMMIT 10000 LOAD CSV WITH HEADERS FROM 'file:///"+exportFileMappings+"""' AS line + MATCH (f:Term { curie: line.fromCurie}),(t:Term { curie: line.toCurie}) + WITH f,t,line + CREATE (f)-[m:MAPPING { sourcePrefix: line.datasourcePrefix, datasource: line.datasource, sourceType: line.sourceType, scope: line.scope, date: line.date}]->(t)""" + +result = session.run(loadMappingsCypher) +print result.summary() + +#After Loading, update indexes +print "updating indexes" +reply = urllib.urlopen(OXO.oxoUrl+"/api/search/rebuild?apikey="+OXO.apikey) +print "Finished process!" diff --git a/dataloading/oxo/OlsDatasetLoader.py b/dataloading/oxo/OlsDatasetLoader.py new file mode 100644 index 0000000..d73005f --- /dev/null +++ b/dataloading/oxo/OlsDatasetLoader.py @@ -0,0 +1,136 @@ +import urllib +import json +import xml.etree.ElementTree as ET +import yaml +import OxoClient as OXO +import csv +from ConfigParser import SafeConfigParser + +prefixToPreferred = {} +idorgNamespace = {} + +unprocessedIds = {} +termToIri = {} +termToLabel = {} + +#config.read(sys.argv[1]) +config = SafeConfigParser() +config.read("../config/oxo_dataRelease_config.ini") + +OXO.oxoUrl = config.get("Basics","oxoUrl") +OXO.apikey = config.get("Basics", "oxoAPIkey") +oboDbxrefUrl= config.get("Basics", "oboDbxrefUrl") + +olsurl=config.get("Basics", "olsurl") +olsurl=olsurl+"/ontologies?size=1000" + +idorgDataLocation = config.get("Paths", "idorgDataLocation") + +reply = urllib.urlopen(olsurl) +anwser = json.load(reply) + +ontologies = anwser["_embedded"]["ontologies"] + +for ontology in ontologies: + namespace = ontology["config"]["namespace"] + version = ontology["updated"] + + if namespace == 'ordo': + prefPrefix = 'Orphanet' + else: + prefPrefix = ontology["config"]["preferredPrefix"] + + title = ontology["config"]["title"] + desc = ontology["config"]["description"] + prefixToPreferred[prefPrefix.lower()] = prefPrefix + prefixToPreferred[namespace.lower()] = prefPrefix + + OXO.saveDatasource(prefPrefix, None, title, desc, "ONTOLOGY", None, [namespace], "https://creativecommons.org/licenses/by/4.0/", "Last updated in the ontology lookup service on "+version ) +# get namespaces from identifiers.org + +#urllib.urlopen('http://www.ebi.ac.uk/miriam/main/export/xml/') +tree = ET.ElementTree(file=idorgDataLocation) + +# from id.org default to namespace +# if no spaces in title, this is usally a better option +# unless a preferred prefix is provided, then always use that 
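# that preference rule, as a compact sketch (helper name is illustrative):
#
#   def preferredPrefix(namespace, title, preferred=None):
#       if preferred:            # an explicit preferredPrefix always wins
#           return preferred
#       if ' ' not in title:     # a one-word title beats the raw namespace
#           return title
#       return namespace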
+ +rootElem = tree.getroot() +for datatype in rootElem.findall('{http://www.biomodels.net/MIRIAM/}datatype'): + namespace = datatype.find('{http://www.biomodels.net/MIRIAM/}namespace').text + prefPrefix = namespace + + + title = datatype.find('{http://www.biomodels.net/MIRIAM/}name').text + desc = datatype.find('{http://www.biomodels.net/MIRIAM/}definition').text + licence = None + versionInfo = None + + altPrefixes = [namespace] + + if datatype.find('{http://www.biomodels.net/MIRIAM/}licence') is not None: + licence = datatype.find('{http://www.biomodels.net/MIRIAM/}licence').text + if datatype.find('{http://www.biomodels.net/MIRIAM/}versionInfo') is not None: + versionInfo = datatype.find('{http://www.biomodels.net/MIRIAM/}versionInfo').text + + if datatype.find('{http://www.biomodels.net/MIRIAM/}preferredPrefix') is not None: + prefPrefix = datatype.find('{http://www.biomodels.net/MIRIAM/}preferredPrefix').text + elif ' ' not in title: + prefPrefix = title + + # add titles to alt prefix if + if ' ' not in title: + altPrefixes.append(title) + + if datatype.find('{http://www.biomodels.net/MIRIAM/}alternatePrefixes') is not None: + for altPrefixs in datatype.find('{http://www.biomodels.net/MIRIAM/}alternatePrefixes'): + altPrefixes.append(altPrefixs.text) + + if prefPrefix.lower() in prefixToPreferred: + print "Ignoring "+namespace+" from idorg as it is already registered as a datasource" + elif namespace.lower() in prefixToPreferred: + print "Ignoring " + namespace + " from idorg as it is already registered as a datasource" + else: + idorgNamespace[prefPrefix.lower()] = prefPrefix + idorgNamespace[namespace.lower()] = prefPrefix + idorgNamespace[title.lower()] = prefPrefix + prefixToPreferred[prefPrefix.lower()] = prefPrefix + prefixToPreferred[namespace.lower()] = prefPrefix + prefixToPreferred[title.lower()] = prefPrefix + OXO.saveDatasource(prefPrefix, namespace, title, desc, "DATABASE", None, altPrefixes, licence, versionInfo) + + +#oboDbxrefUrl = 'https://raw.githubusercontent.com/geneontology/go-site/master/metadata/db-xrefs.yaml' +# Read from OBO db-xrefs +yamlData = yaml.load(urllib.urlopen(oboDbxrefUrl)) + +for database in yamlData: + namespace= database["database"] + title = database["name"] + prefPrefix = namespace + + altPrefixes = [namespace] + if namespace.lower() in prefixToPreferred: + print "Ignoring " + namespace + " from OBO as it is already registered as a datasource" + else: + urlSyntax = None + if "entity_types" in database: + if "url_syntax" in database["entity_types"][0]: + urlSyntax = database["entity_types"][0]["url_syntax"].replace("[example_id]", "") + prefixToPreferred[namespace.lower()] = prefPrefix + + OXO.saveDatasource(prefPrefix, None, title, None, "DATABASE",urlSyntax, altPrefixes, None, None) + + +# Create Paxo as datasources +print "Save paxo as datasource" +prefPrefix="paxo" +namespace=None +title="Paxo" +desc=None +sourceType="DATABASE" +urlSyntax=None +altPrefixes=["paxo"] +licence=None +versionInfo=0.1 +OXO.saveDatasource(prefPrefix, namespace, title, desc, sourceType, urlSyntax, altPrefixes, licence, versionInfo) diff --git a/dataloading/oxo/OxoClient.py b/dataloading/oxo/OxoClient.py new file mode 100644 index 0000000..fc05b2f --- /dev/null +++ b/dataloading/oxo/OxoClient.py @@ -0,0 +1,173 @@ +import urllib +import requests +import json +from ConfigParser import SafeConfigParser + +def saveDatasource (prefix, idorgNamespace, title, description, sourceType, baseUri, alternatePrefixes, licence, versionInfo): + #print "saving new datasource: 
{},{},{},{},{},{}".format(prefix, idorgNamespace, title, sourceType, baseUri, alternatePrefixes) + if not baseUri: + baseUri = [] + else: + baseUri = [baseUri] + + postdata = { + "prefix": prefix, + "idorgNamespace": idorgNamespace, + "alternatePrefix": alternatePrefixes, + "alternateIris": baseUri, + "name": title, + "description": description, + "licence": licence, + "versionInfo": versionInfo, + "source": sourceType + } + + url = oxoUrl+"/api/datasources?apikey="+apikey + headers = {'Content-type': 'application/json', 'Accept': 'application/json'} + #print url + r = requests.post(url, data=json.dumps(postdata), headers=headers) + if r.status_code != requests.codes.ok: + print r + print r.text + print r.status_code + print json.loads(r.text)["message"] + +def saveMapping(fromPrefix, fromId, fromLabel, fromUri, mappingSourceId, toPrefix, toId, sourceType): + + fromCurie = fromPrefix+":"+fromId + toCurie = toPrefix+":"+toId + + url = oxoUrl+"/api/mappings?apikey="+apikey + headers = {'Content-type': 'application/json', 'Accept': 'application/json'} + + postdata = { + "fromId": fromCurie, + "toId": toCurie, + "datasourcePrefix": mappingSourceId, + "sourceType": sourceType, + "scope": "RELATED" + } + # print "saving new mapping: {} -> {} -> {}".format(fromCurie, mappingSourceId, toCurie) + + r = requests.post(url, data=json.dumps(postdata), headers=headers) + + if r.status_code != requests.codes.ok: + print json.loads(r.text)["message"] + +def saveMappings(mappings): + + url = oxoUrl+"/api/mappings?apikey="+apikey + headers = {'Content-type': 'application/json', 'Accept': 'application/json'} + + # print "saving new mapping: {} -> {} -> {}".format(fromCurie, mappingSourceId, toCurie) + + r = requests.post(url, data=json.dumps(mappings), headers=headers) + + if r.status_code != requests.codes.ok: + print json.loads(r.text)["message"] + +def updateTerm(curie, iri, label): + # if iri and label then patch + headers = {'Content-type': 'application/json', 'Accept': 'application/json'} + + params=None + if iri and label: + params = "uri=" + urllib.quote_plus(iri.encode('utf-8'))+"&label=" + urllib.quote_plus(label.encode('utf-8')) + if not iri and not label: + iriMap = getIriAndLabelFromOls(curie) + if iriMap: + olsIri = iriMap["uri"] + if olsIri: + params = "uri=" + urllib.quote_plus(olsIri.encode('utf-8')) + olsLabel = iriMap["label"] + if olsLabel: + params = "uri=" + urllib.quote_plus(olsIri.encode('utf-8')) + "&label=" + urllib.quote_plus(olsLabel.encode('utf-8')) + if not iri and label: + params = "label=" + urllib.quote_plus(label.encode('utf-8')) + + + if params: + url = oxoUrl+"/api/terms/" + curie + "?" 
+ params + "&apikey="+apikey + r = requests.patch(url, data=None, headers=headers) + return r.status_code == requests.codes.ok + return False + + +olsLabel = {} +olsIri = {} +def getIriAndLabelFromOls(curie, olsurl): + if curie in olsIri: + return {"uri" : olsIri[curie], "label": olsLabel[curie]} + else: + olsurl = olsurl+"/terms?obo_id="+curie + reply = urllib.urlopen(olsurl) + if reply.getcode() == 200: + anwser = json.load(reply) + if "_embedded" in anwser.keys(): + terms = anwser["_embedded"]["terms"] + label = None + uri = None + for term in terms: + label = term["label"] + uri = term["iri"] + is_defining_ontology = term["is_defining_ontology"] + if is_defining_ontology: + olsLabel[curie] = label + olsIri[curie] = uri + return {'uri': uri, 'label': label} + olsLabel[curie] = label + olsIri[curie] = uri + return {'uri': uri, 'label': label} + return None + +def getLabelFromOls(curie): + + if olsLabel[curie]: + return olsLabel[curie] + else: + olsurl = olsurl+"/terms?obo_id="+curie + reply = urllib.urlopen(olsurl) + if reply.getcode() == 200: + anwser = json.load(reply) + if "_embedded" in anwser.keys(): + terms = anwser["_embedded"]["terms"] + label = None + uri = None + for term in terms: + label = term["label"] + uri = term["iri"] + is_defining_ontology = term["is_defining_ontology"] + if is_defining_ontology: + olsLabel[curie] = label + olsIri[curie] = uri + return label + olsLabel[curie] = label + olsIri[curie] = uri + return label + return None + +def getOxODatasets(): + url = oxoUrl + "/api/datasources?size=4000" + reply = urllib.urlopen(url) + anwser = json.load(reply) + return anwser["_embedded"]["datasources"] + +def getPrefixFromCui (id): + + if ":" in id and len(id.split(":")) == 2: + return id.split(":")[0] + + if "_" in id and len(id.split("_")) == 2: + return id.split("_")[0] + + return None + +def getIdFromCui (id): + + if ":" in id and len(id.split(":")) == 2: + return id.split(":")[1] + + if "_" in id and len(id.split("_")) == 2: + return id.split("_")[1] + + return None diff --git a/paxo/clientOperations.py b/dataloading/paxo/clientOperations.py similarity index 56% rename from paxo/clientOperations.py rename to dataloading/paxo/clientOperations.py index 9c226dc..464fc7e 100644 --- a/paxo/clientOperations.py +++ b/dataloading/paxo/clientOperations.py @@ -7,23 +7,29 @@ import json from ConfigParser import SafeConfigParser import ast +import neoExporter +import sys #Compares to ontologies from the OLS. 
This process can take a while and produces a csv with primary results
 def scoreOntologies(sourceOntology, targetOntology, scoreParams, scoringtargetFolder):
     logging.info("Start scoring "+sourceOntology+" and "+targetOntology)
     #Check for the smaller ontology
-    url=config.get("Basics","olsAPIURL")
+
+    olsURL=config.get("Basics","olsAPIURL")
+    oxoURL=config.get("Basics","oxoURL")
+    urls={"ols":olsURL, "oxo":oxoURL}
+
     try:
-        r = requests.get(url+"ontologies/"+sourceOntology)
+        r = requests.get(olsURL+"ontologies/"+sourceOntology)
         numberOfTerms=r.json()['numberOfTerms']
-        r = requests.get(url+"ontologies/"+targetOntology)
+        r = requests.get(olsURL+"ontologies/"+targetOntology)
         numberOfTerms2 = r.json()['numberOfTerms']
     except:
         logging.error("Error getting number of terms through webservice call!")
-        logging.error(url+"ontologies/"+sourceOntology)
-        logging.error(url+"ontologies/"+targetOntology)
+        logging.error(olsURL+"ontologies/"+sourceOntology)
+        logging.error(olsURL+"ontologies/"+targetOntology)
         logging.error(r)
         raise
@@ -33,7 +39,7 @@ def scoreOntologies(sourceOntology, targetOntology, scoreParams, scoringtargetFo
         sourceOntology=targetOntology
         targetOntology=tmpOntology
 
-    termsUrl=url+"ontologies/"+sourceOntology+"/terms?size=5&fieldList=iri,label,synonym"
+    termsUrl=olsURL+"ontologies/"+sourceOntology+"/terms?size=10&fieldList=iri,label,synonym"
 
     results=[]
     results.append(["sourceLabel","sourceIRI", "fuzzy", "oxo", "synFuzzy", "synOxo", "bridgeTerms"])
@@ -55,9 +61,8 @@
 
             #Check if the term is actually defined in that ontology
             if term['is_defining_ontology'] is True:
-                pscore=flaskMapping.scoreTermOLS(term["iri"], originalLabel, targetOntology, scoreParams)
+                pscore=flaskMapping.scoreTermOLS(term["iri"], originalLabel, targetOntology, scoreParams, urls)
                 try:
-
                     calculatedMappings=flaskMapping.processPScore(pscore)
                 except Exception as e:
                     print "Exception in primary Scoring"
@@ -74,7 +79,7 @@
                 if synonyms!=None:
                     for synonym in synonyms:
                         try:
-                            synPscore=flaskMapping.primaryScoreTerm('', synonym, targetOntology, scoreParams)
+                            synPscore=flaskMapping.primaryScoreTerm('', synonym, targetOntology, scoreParams, urls)
                             synCalculatedMappings=flaskMapping.processPScore(synPscore)   #Process the primaryScore for synonyms
                             synCalculatedMappings['sourceIRI']=term["iri"]
                         except Exception as e:
@@ -96,12 +101,11 @@
 
         try:
             termsUrl=r.json()['_links']['next']['href']
-            ###This is just temporary to not process all the stuff but abort after two pages
             counter=counter+1
             if counter%2==0:
                 print "Processed "+str(counter)+" pages"
                 logging.info("Processed "+str(counter)+" pages")
-                break # Not necessary if we want to parse whole ontology, just activate this for testing
+                #break #Uncomment this for testing (to not parse the whole ontology)
         except:
             logging.info("Reached last page I reckon")
             print "Reached last page I reckon"
@@ -234,9 +238,9 @@ def curationOntologyFinalScore(scoredMatrix):
 
     print "A total of "+str(doubleEntryCounter)+" processed!"
- #if len(replacedList)!=0: - # print "Write file of replaced terms now" - # print "Total length of replaced is "+str(len(replacedList)) + if len(replacedList)!=0: + print "Write file of replaced terms now" + print "Total length of replaced is "+str(len(replacedList)) # with open('pipeline_output/replaced_terms.csv', 'wb') as f: # writer = csv.writer(f) # writer.writerows(replacedList) @@ -258,10 +262,10 @@ def calculatePrimaryScore(combinedOntologyName, params, scoringTargetFolder, wri return preparedScoredMatrix #Calculates a score and Validates it against a standard for a pair of ontologies -def calculateAndValidateOntologyPrimaryScore(onto1, onto2, stdName, stdFile, params, scoringTargetFolder, writeToDisc,predictedTargetFolder, parseParms, curationOfDoubleEntries, validationTargetFolder): +def calculateAndValidateOntologyPrimaryScore(onto1, onto2, stdName, stdFile, params, scoringTargetFolder, writeToDisc,predictedTargetFolder, parseParms, curationOfDoubleEntries, validationTargetFolder, url): combinedOntologyName=onto1+"_"+onto2 preparedScoredMatrix=calculatePrimaryScore(combinedOntologyName, params, scoringTargetFolder, writeToDisc, predictedTargetFolder,curationOfDoubleEntries) - validationResult=validation.validateFinaleScore(onto1, onto2, stdName, preparedScoredMatrix, stdFile, writeToDisc, params, parseParms, validationTargetFolder) + validationResult=validation.validateFinaleScore(onto1, onto2, stdName, preparedScoredMatrix, stdFile, writeToDisc, params, parseParms, validationTargetFolder, url) return validationResult #Goes through the sections and calls scoreOntologies for every section @@ -316,7 +320,7 @@ def calculateAndValidateListOntologies(sections, writeToDiscFlag, curationOfDoub params={"fuzzyUpperLimit": fuzzyUpperLimit, "fuzzyLowerLimit": fuzzyLowerLimit,"fuzzyUpperFactor": fuzzyUpperFactor,"fuzzyLowerFactor":fuzzyLowerFactor, "oxoDistanceOne":oxoDistanceOne, "oxoDistanceTwo":oxoDistanceTwo, "oxoDistanceThree":oxoDistanceThree, "synFuzzyFactor":synFuzzyFactor, "synOxoFactor": synOxoFactor, "bridgeOxoFactor":bridgeOxoFactor, "threshold":threshold} print "Validate "+sourceOntology+" "+targetOntology+" "+name - print calculateAndValidateOntologyPrimaryScore(sourceOntology, targetOntology, name, stdFile, params, scoringtargetFolder, writeToDiscFlag, predictedTargetFolder, parseParms, curationOfDoubleEntries,validationTargetFolder) + print calculateAndValidateOntologyPrimaryScore(sourceOntology, targetOntology, name, stdFile, params, scoringtargetFolder, writeToDiscFlag, predictedTargetFolder, parseParms, curationOfDoubleEntries,validationTargetFolder,config.get('Basics','olsAPIURL')+"search") #Goes through the sections and calls calculateOntologyPrimaryScore for every section @@ -345,93 +349,155 @@ def calculateListOntologies(sections, writeToDisc, curationOfDoubleEntries): print predictedTargetFolder print "Calculate "+sourceOntology+" "+targetOntology print scoringTargetFolder - return calculatePrimaryScore(sourceOntology+"_"+targetOntology, params, scoringTargetFolder, writeToDisc, predictedTargetFolder, curationOfDoubleEntries) - - -config = SafeConfigParser() -config.read("config.ini") -logFile=config.get("Basics","logFile") -logging.basicConfig(filename=logFile, level=logging.INFO, format='%(asctime)s - %(message)s') - -#writeToDiscFlag=config.get("Basics", ...) 
-writeToDiscFlag=False -uniqueMaps=True - -sections=config.sections()[2:] - -#Control via config file -#scoreListOntologies(sections) -#calculateListOntologies(sections, writeToDiscFlag, uniqueMaps) -#calculateAndValidateListOntologies(sections, writeToDiscFlag, uniqueMaps) - -#removeStopwordsList=['of', 'the'] -#replaceTermList=[('cancer', 'carcinom'), ('cancer', 'neoplasm'), ('cancer','carcinoma'),('abnormality','disease')] -#scoreParams={"removeStopwordsList": removeStopwordsList, "replaceTermList" :replaceTermList} -#hp_doid_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : [('cancer', 'carcinom'), ('cancer', 'neoplasm'), ('cancer','carcinoma'),('abnormality','disease')]} - -### Primary score ontologies -#ordo_hp_scoreParams={"removeStopwordsList": ['of', 'the', 'Rare'], "replaceTermList" : [('cancer', 'carcinom'), ('cancer', 'neoplasm'), ('cancer','carcinoma'),('tumor', 'neoplasm'), ('tumor','cancer'), ('abnormality', 'disease'), ('decreased', 'reduced'), ('morphology', '')]} -#scoreOntologies("ordo","hp", ordo_hp_scoreParams, 'final_dec/scoring/') -# -#doid_mp_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : []} -#scoreOntologies("doid","mp", doid_mp_scoreParams, 'final_dec/scoring/') -# # -# doid_ordo_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : []} -# scoreOntologies("doid","ordo", doid_ordo_scoreParams, 'final_dec/scoring/') -# # -# hp_doid_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : [('cancer', 'neoplasm'), ('cancer','carcinoma'), ('abnormality','disease'), 'abnormality','disease']} -# scoreOntologies("hp","doid",hp_doid_scoreParams, 'final_dec/scoring/') -# # -# hp_mp_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : [('cancer', 'carcinom'), ('cancer', 'neoplasm'), ('cancer','carcinoma'),('abnormality','disease'), ('abnormal','Abnormality')]} -# scoreOntologies("hp","mp", hp_mp_scoreParams, 'final_dec/scoring/') -# # -# ordo_mp_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : []} -# scoreOntologies("ordo","mp", ordo_mp_scoreParams, 'final_dec/scoring/') - - -#mesh_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : []} -#scoreOntologies("mesh","hp", mesh_scoreParams, 'final_dec/scoring/') -#scoreOntologies("mesh","doid", mesh_scoreParams, 'final_dec/scoring/') -#scoreOntologies("mesh","ordo", mesh_scoreParams, 'final_dec/scoring/') -#scoreOntologies("mesh","mp", mesh_scoreParams, 'final_dec/scoring/') - - - -#Could/Should be changed so parameters come from the config file -params={"fuzzyUpperLimit": 0.8, "fuzzyLowerLimit": 0.6,"fuzzyUpperFactor": 1,"fuzzyLowerFactor":0.6, "oxoDistanceOne":1, "oxoDistanceTwo":0.3, "oxoDistanceThree":0.1, "synFuzzyFactor":0.6, "synOxoFactor": 0.4, "bridgeOxoFactor":1, "threshold":0.6} -#params={"fuzzyUpperLimit": 0.8, "fuzzyLowerLimit": 0.6,"fuzzyUpperFactor": 1,"fuzzyLowerFactor":0.6, "oxoDistanceOne":1, "oxoDistanceTwo":0.3, "oxoDistanceThree":0.1, "synFuzzyFactor":0.6, "synOxoFactor": 0.4, "bridgeOxoFactor":1, "threshold":0.8} - -### Execute Calculate and validate for a certain file -#print calculateAndValidateOntologyPrimaryScore('hp', 'doid', 'loom', 'Loom/DOID_HP_loom.csv', params, 'final_dec/scoring/', writeToDiscFlag, 'final_dec/predicted/', {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps, 'final_dec/validation/') -#print calculateAndValidateOntologyPrimaryScore('hp','doid', 'silver','silver_nov/Consensus-3-hp-doid.tsv', params, 'final_dec/scoring/', 
writeToDiscFlag, 'final_dec/predicted/',{'uri1':0, 'uri2':2, 'scorePosition':4 , 'delimiter':'\t'}, uniqueMaps, 'final_dec/validation/') -#print calculateAndValidateOntologyPrimaryScore('ordo', 'hp', 'loom', 'Loom/ordo_hp_loom.csv', params,'final_dec/scoring/', writeToDisc, final_dec/predicted/', {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps, 'final_dec/validation/') - - -#params={"fuzzyUpperLimit": 0, "fuzzyLowerLimit": 0,"fuzzyUpperFactor": 0.65, "fuzzyLowerFactor":0, "oxoDistanceOne":0.00029, "oxoDistanceTwo":0.57, "oxoDistanceThree":0.027, "synFuzzyFactor":0.247, "synOxoFactor": 0.62, "bridgeOxoFactor":0.829, "threshold":0.6} - -print calculateAndValidateOntologyPrimaryScore('ordo', 'hp', 'silver','silver_nov/Consensus-3-hp-ordo.tsv', params,'final_dec/scoring/', writeToDiscFlag, 'final_dec/predicted/',{'uri1':2, 'uri2':0, 'scorePosition':4 , 'delimiter':'\t'}, uniqueMaps, 'final_dec/validation/') -# {'misses': 210, 'alternatives': 350} - -# # -# print calculateAndValidateOntologyPrimaryScore('mp','hp', 'loom','Loom/MP_HP_loom.csv', params,'final_dec/scoring/', writeToDiscFag, 'final_dec/predicted/', {'uri1':0, 'uri2':1, 'scorePosition':2 , 'delimiter':','}, uniqueMaps, 'final_dec/validation/') -print calculateAndValidateOntologyPrimaryScore('mp','hp', 'silver','silver_nov/Consensus-3-hp-mp.tsv', params, 'final_dec/scoring/',writeToDiscFlag, 'final_dec/predicted/', {'uri1':2, 'uri2':0, 'scorePosition':4 , 'delimiter':'\t'}, uniqueMaps, 'final_dec/validation/') -# print calculateAndValidateOntologyPrimaryScore('ordo','doid', 'loom' ,'Loom/DOID_ORDO_loom.csv', params, 'final_dec/scoring/',writeToDisc, 'final_dec/predicted/', {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps, 'final_dec/validation/') -print calculateAndValidateOntologyPrimaryScore('ordo','doid', 'silver','silver_nov/Consensus-3-doid-ordo.tsv', params, 'final_dec/scoring/', writeToDiscFlag,'final_dec/predicted/', {'uri1':2, 'uri2':0, 'scorePosition':4 , 'delimiter':'\t'}, uniqueMaps, 'final_dec/validation/') -# print calculateAndValidateOntologyPrimaryScore('ordo','mp', 'loom', 'Loom/mp_ordo_loom.csv', params,'final_dec/scoring/', writeToDiscFlag, 'final_dec/predicted/', {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps, 'final_dec/validation/') -print calculateAndValidateOntologyPrimaryScore('ordo','mp', 'silver','silver_nov/Consensus-3-mp-ordo.tsv', params,'final_dec/scoring/', writeToDiscFlag, 'final_dec/predicted/', {'uri1':2, 'uri2':0, 'scorePosition':4 , 'delimiter':'\t'}, uniqueMaps, 'final_dec/validation/') -#print calculateAndValidateOntologyPrimaryScore('mp','doid', 'loom', 'Loom/DOID_MP_loom.csv', params, 'final_dec/scoring/',writeToDiscFlag, 'final_dec/predicted/', {'uri1':1, 'uri2':0, 'scorePosition':2, 'delimiter':','}, uniqueMaps, 'final_dec/validation/') -print calculateAndValidateOntologyPrimaryScore('mp','doid', 'silver','silver_nov/Consensus-3-mp-doid.tsv', params, 'final_dec/scoring/',writeToDiscFlag,'final_dec/predicted/', {'uri1':0, 'uri2':2, 'scorePosition':4 , 'delimiter':'\t'}, uniqueMaps, 'final_dec/validation/') -# # -# - -# print calculateAndValidateOntologyPrimaryScore('mesh','doid', 'loom', 'Loom/DOID_MESH_loom_new.csv', params, 'final_dec/scoring/',writeToDisc,'final_dec/predicted/', {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps, 'final_dec/validation/') -# print calculateAndValidateOntologyPrimaryScore('mesh','doid', 'silver', 'silver_nov/Consensus-3-doid-mesh3.tsv', 'final_dec/scoring/',params, writeToDisc, 
'final_dec/predicted/', {'uri1':2, 'uri2':0, 'scorePosition':2, 'delimiter':'\t'}, uniqueMaps, 'final_dec/validation/') -# print calculateAndValidateOntologyPrimaryScore('mesh','hp', 'loom', 'Loom/mesh_hp_loom_new.csv', params, 'final_dec/scoring/',writeToDisc,'final_dec/predicted/', {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps, 'final_dec/validation/') -# print calculateAndValidateOntologyPrimaryScore('mesh','hp', 'silver', 'silver_nov/Consensus-3-hp-mesh3.tsv', params, 'final_dec/scoring/',writeToDisc,'final_dec/predicted/', {'uri1':1, 'uri2':0, 'scorePosition':2, 'delimiter':'\t'}, uniqueMaps, 'final_dec/validation/') -# print calculateAndValidateOntologyPrimaryScore('mesh','mp', 'loom', 'Loom/mesh_mp_loom_new.csv', params, 'final_dec/scoring/',writeToDisc,'final_dec/predicted/', {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps, 'final_dec/validation/') -# print calculateAndValidateOntologyPrimaryScore('mesh','mp', 'silver', 'silver_nov/Consensus-3-mp-mesh3.tsv', params,'final_dec/scoring/', writeToDisc, 'final_dec/predicted/', {'uri1':1, 'uri2':0, 'scorePosition':2, 'delimiter':'\t'}, uniqueMaps, 'final_dec/validation/') - - -###Execute functions for terms (#Broken since last change?) -#scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : []} -#print flaskMapping.scoreTermLabel("Nuclear cataract", "doid", scoreParams, params) + print calculatePrimaryScore(sourceOntology+"_"+targetOntology, params, scoringTargetFolder, writeToDisc, predictedTargetFolder, curationOfDoubleEntries) + + +def exportNeoList(sections): + for section in sections: + sourceOntology=config.get(section, 'sourceOntology') + targetOntology=config.get(section, 'targetOntology') + predictedFolder=config.get('Params','predictedTargetFolder') + targetFolder=config.get('Params','neoFolder') + + olsURL=config.get('Basics', 'olsAPIURL') + neoURL=config.get('Basics','neoURL') + neoUser=config.get('Basics','neoUser') + neoPW=config.get('Basics','neoPW') + + neoExporter.exportInNeo(sourceOntology, targetOntology, predictedFolder, targetFolder, olsURL, neoURL, neoUser, neoPW) + + print "Completed neo4J export" + + + + +#Here main starts + +##First definition of two global variables +replacementTerms={ +"ordo_hp" : [('cancer', 'carcinom'), ('cancer', 'neoplasm'), ('cancer','carcinoma'),('tumor', 'neoplasm'), ('tumor','cancer'), ('abnormality', 'disease'), ('decreased', 'reduced'), ('morphology', '')], +"doid_mp" : [], +"doid_ordo" :[], +"hp_doid" : [('cancer', 'neoplasm'), ('cancer','carcinoma'), ('abnormality','disease'), 'abnormality','disease'], +"hp_mp" : [('cancer', 'carcinom'), ('cancer', 'neoplasm'), ('cancer','carcinoma'),('abnormality','disease'), ('abnormal','Abnormality')], +"ordo_mp" : [] +} + +helptext="""Start the client with exactly two input parameters: The path to the config file and one of the following flags: + -s: Score a list of ontology mappings. This creates a primary raw score + -c: Calculate from the primary raw score a predicted score + -cv: Calculate a predicted score but also validate it against given standard files. + -n: Reads in a predicted score file and exports it to a neo4j compatible format + + example: python clientOperations.py config.ini -s + """ +#Parse the input parameters. A config file and a flag is expected +if len(sys.argv)<3: + print helptext + print "\nNot enough arguments! Take exactly two, "+str(len(sys.argv)-1)+" given!" +elif len(sys.argv)>3: + print helptext + print "\nToo many arguments! 
Take exactly two, "+str(len(sys.argv)-1)+" given!" +else: + + config = SafeConfigParser() + config.read(sys.argv[1]) + + logFile=config.get("Basics","logFile") + logging.basicConfig(filename=logFile, level=logging.INFO, format='%(asctime)s - %(message)s') + + #writeToDiscFlag=config.get("Basics", ...) + #writeToDiscFlag=True #Also out of config file? + #uniqueMaps=True #Also out of config file + writeToDiscFlag=config.getboolean("Params","writeToDiscFlag") + uniqueMaps=config.getboolean("Params","uniqueMaps") + + #Throw away the first 2 sections and take only the actual mapping part of the config into account + sections=config.sections()[2:] + + if sys.argv[2]=="-s": + scoreListOntologies(sections) + elif sys.argv[2]=="-c": + calculateListOntologies(sections, writeToDiscFlag, uniqueMaps) + elif sys.argv[2]=="-cv": + calculateAndValidateListOntologies(sections, writeToDiscFlag, uniqueMaps) + elif sys.argv[2]=="-n": + exportNeoList(sections) + else: + print "Could not recognize option. So I execute what's uncommented in the else branch. This should just be during development" + #removeStopwordsList=['of', 'the'] + #replaceTermList=[('cancer', 'carcinom'), ('cancer', 'neoplasm'), ('cancer','carcinoma'),('abnormality','disease')] + #scoreParams={"removeStopwordsList": removeStopwordsList, "replaceTermList" :replaceTermList} + #hp_doid_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : [('cancer', 'carcinom'), ('cancer', 'neoplasm'), ('cancer','carcinoma'),('abnormality','disease')]} + + + ### Primary score ontologies + #ordo_hp_scoreParams={"removeStopwordsList": ['of', 'the', 'Rare'], "replaceTermList" : [('cancer', 'carcinom'), ('cancer', 'neoplasm'), ('cancer','carcinoma'),('tumor', 'neoplasm'), ('tumor','cancer'), ('abnormality', 'disease'), ('decreased', 'reduced'), ('morphology', '')]} + #scoreOntologies("ordo","hp", ordo_hp_scoreParams, 'final_dec/scoring/') + # + #doid_mp_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : []} + #scoreOntologies("doid","mp", doid_mp_scoreParams, 'final_dec/scoring/') + # # + # doid_ordo_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : []} + # scoreOntologies("doid","ordo", doid_ordo_scoreParams, 'final_dec/scoring/') + # # + # hp_doid_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : [('cancer', 'neoplasm'), ('cancer','carcinoma'), ('abnormality','disease'), 'abnormality','disease']} + # scoreOntologies("hp","doid",hp_doid_scoreParams, 'final_dec/scoring/') + # # + # hp_mp_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : [('cancer', 'carcinom'), ('cancer', 'neoplasm'), ('cancer','carcinoma'),('abnormality','disease'), ('abnormal','Abnormality')]} + # scoreOntologies("hp","mp", hp_mp_scoreParams, 'final_dec/scoring/') + # # + # ordo_mp_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : []} + # scoreOntologies("ordo","mp", ordo_mp_scoreParams, 'final_dec/scoring/') + + + #mesh_scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : []} + #scoreOntologies("mesh","hp", mesh_scoreParams, 'final_dec/scoring/') + #scoreOntologies("mesh","doid", mesh_scoreParams, 'final_dec/scoring/') + #scoreOntologies("mesh","ordo", mesh_scoreParams, 'final_dec/scoring/') + #scoreOntologies("mesh","mp", mesh_scoreParams, 'final_dec/scoring/') + + + + #Could/Should be changed so parameters come from the config file + params={"fuzzyUpperLimit": 0.8, "fuzzyLowerLimit": 0.6,"fuzzyUpperFactor": 1,"fuzzyLowerFactor":0.6, "oxoDistanceOne":1, 
"oxoDistanceTwo":0.3, "oxoDistanceThree":0.1, "synFuzzyFactor":0.6, "synOxoFactor": 0.4, "bridgeOxoFactor":1, "threshold":0.6} + #params={"fuzzyUpperLimit": 0.8, "fuzzyLowerLimit": 0.6,"fuzzyUpperFactor": 1,"fuzzyLowerFactor":0.6, "oxoDistanceOne":1, "oxoDistanceTwo":0.3, "oxoDistanceThree":0.1, "synFuzzyFactor":0.6, "synOxoFactor": 0.4, "bridgeOxoFactor":1, "threshold":0.8} + + ### Execute Calculate and validate for a certain file + #print calculateAndValidateOntologyPrimaryScore('hp', 'doid', 'loom', 'Loom/DOID_HP_loom.csv', params, 'final_dec/scoring/', writeToDiscFlag, 'final_dec/predicted/', {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps, 'final_dec/validation/') + #print calculateAndValidateOntologyPrimaryScore('hp','doid', 'silver','silver_nov/Consensus-3-hp-doid.tsv', params, 'final_dec/scoring/', writeToDiscFlag, 'final_dec/predicted/',{'uri1':0, 'uri2':2, 'scorePosition':4 , 'delimiter':'\t'}, uniqueMaps, 'final_dec/validation/') + #print calculateAndValidateOntologyPrimaryScore('ordo', 'hp', 'loom', 'Loom/ordo_hp_loom.csv', params,'final_dec/scoring/', writeToDisc, final_dec/predicted/', {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps, 'final_dec/validation/') + + + #params={"fuzzyUpperLimit": 0, "fuzzyLowerLimit": 0,"fuzzyUpperFactor": 0.65, "fuzzyLowerFactor":0, "oxoDistanceOne":0.00029, "oxoDistanceTwo":0.57, "oxoDistanceThree":0.027, "synFuzzyFactor":0.247, "synOxoFactor": 0.62, "bridgeOxoFactor":0.829, "threshold":0.6} + + #print calculateAndValidateOntologyPrimaryScore('ordo', 'hp', 'silver','silver_nov/Consensus-3-hp-ordo.tsv', params,'final_dec/scoring/', writeToDiscFlag, 'final_dec/predicted/',{'uri1':2, 'uri2':0, 'scorePosition':4 , 'delimiter':'\t'}, uniqueMaps, 'final_dec/validation/') + # {'misses': 210, 'alternatives': 350} + + # # + # print calculateAndValidateOntologyPrimaryScore('mp','hp', 'loom','Loom/MP_HP_loom.csv', params,'final_dec/scoring/', writeToDiscFag, 'final_dec/predicted/', {'uri1':0, 'uri2':1, 'scorePosition':2 , 'delimiter':','}, uniqueMaps, 'final_dec/evaluation/') + #print calculateAndValidateOntologyPrimaryScore('mp','hp', 'silver','silver_nov/Consensus-3-hp-mp.tsv', params, 'final_dec/scoring/',writeToDiscFlag, 'final_dec/predicted/', {'uri1':2, 'uri2':0, 'scorePosition':4 , 'delimiter':'\t'}, uniqueMaps, 'final_dec/evaluation/') + # print calculateAndValidateOntologyPrimaryScore('ordo','doid', 'loom' ,'Loom/DOID_ORDO_loom.csv', params, 'final_dec/scoring/',writeToDisc, 'final_dec/predicted/', {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps, 'final_dec/evaluation/') + #print calculateAndValidateOntologyPrimaryScore('ordo','doid', 'silver','silver_nov/Consensus-3-doid-ordo.tsv', params, 'final_dec/scoring/', writeToDiscFlag,'final_dec/predicted/', {'uri1':2, 'uri2':0, 'scorePosition':4 , 'delimiter':'\t'}, uniqueMaps, 'final_dec/evaluation/') + # print calculateAndValidateOntologyPrimaryScore('ordo','mp', 'loom', 'Loom/mp_ordo_loom.csv', params,'final_dec/scoring/', writeToDiscFlag, 'final_dec/predicted/', {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps, 'final_dec/evaluation/') + #print calculateAndValidateOntologyPrimaryScore('ordo','mp', 'silver','silver_nov/Consensus-3-mp-ordo.tsv', params,'final_dec/scoring/', writeToDiscFlag, 'final_dec/predicted/', {'uri1':2, 'uri2':0, 'scorePosition':4 , 'delimiter':'\t'}, uniqueMaps, 'final_dec/evaluation/') + #print calculateAndValidateOntologyPrimaryScore('mp','doid', 'loom', 'Loom/DOID_MP_loom.csv', params, 
'final_dec/scoring/',writeToDiscFlag, 'final_dec/predicted/', {'uri1':1, 'uri2':0, 'scorePosition':2, 'delimiter':','}, uniqueMaps, 'final_dec/evaluation/') + #print calculateAndValidateOntologyPrimaryScore('mp','doid', 'silver','silver_nov/Consensus-3-mp-doid.tsv', params, 'final_dec/scoring/',writeToDiscFlag,'final_dec/predicted/', {'uri1':0, 'uri2':2, 'scorePosition':4 , 'delimiter':'\t'}, uniqueMaps, 'final_dec/evaluation/') + # # + # + + # print calculateAndValidateOntologyPrimaryScore('mesh','doid', 'loom', 'Loom/DOID_MESH_loom_new.csv', params, 'final_dec/scoring/',writeToDisc,'final_dec/predicted/', {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps, 'final_dec/validation/') + # print calculateAndValidateOntologyPrimaryScore('mesh','doid', 'silver', 'silver_nov/Consensus-3-doid-mesh3.tsv', 'final_dec/scoring/',params, writeToDisc, 'final_dec/predicted/', {'uri1':2, 'uri2':0, 'scorePosition':2, 'delimiter':'\t'}, uniqueMaps, 'final_dec/validation/') + # print calculateAndValidateOntologyPrimaryScore('mesh','hp', 'loom', 'Loom/mesh_hp_loom_new.csv', params, 'final_dec/scoring/',writeToDisc,'final_dec/predicted/', {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps, 'final_dec/validation/') + # print calculateAndValidateOntologyPrimaryScore('mesh','hp', 'silver', 'silver_nov/Consensus-3-hp-mesh3.tsv', params, 'final_dec/scoring/',writeToDisc,'final_dec/predicted/', {'uri1':1, 'uri2':0, 'scorePosition':2, 'delimiter':'\t'}, uniqueMaps, 'final_dec/validation/') + # print calculateAndValidateOntologyPrimaryScore('mesh','mp', 'loom', 'Loom/mesh_mp_loom_new.csv', params, 'final_dec/scoring/',writeToDisc,'final_dec/predicted/', {'uri1':0, 'uri2':1, 'scorePosition':2, 'delimiter':','}, uniqueMaps, 'final_dec/validation/') + # print calculateAndValidateOntologyPrimaryScore('mesh','mp', 'silver', 'silver_nov/Consensus-3-mp-mesh3.tsv', params,'final_dec/scoring/', writeToDisc, 'final_dec/predicted/', {'uri1':1, 'uri2':0, 'scorePosition':2, 'delimiter':'\t'}, uniqueMaps, 'final_dec/validation/') + + + #Just run calculate without validation + #calculatePrimaryScore('ordo'+"_"+'doid', params, 'final_dec/scoring/', writeToDiscFlag, 'final_dec/predicted/', uniqueMaps) + + + ###Execute functions for terms + #scoreParams={"removeStopwordsList": ['of', 'the'], "replaceTermList" : []} + #print flaskMapping.scoreTermLabel("Nuclear cataract", "doid", scoreParams, params) diff --git a/paxo/flaskMapping.py b/dataloading/paxo/flaskMapping.py similarity index 87% rename from paxo/flaskMapping.py rename to dataloading/paxo/flaskMapping.py index 8034d12..8c8a5d0 100644 --- a/paxo/flaskMapping.py +++ b/dataloading/paxo/flaskMapping.py @@ -3,24 +3,13 @@ import requests import Levenshtein -from ConfigParser import SafeConfigParser +#from ConfigParser import SafeConfigParser from flask import Flask from flask import request from flask import jsonify app = Flask(__name__) - -config = SafeConfigParser() -config.read("config.ini") - -searchURL=config.get("Basics","olsURL") -oxoURL=config.get("Basics","oxoURL") -logFile=config.get("Basics","logFile") - -logging.basicConfig(filename=logFile, level=logging.INFO, format='%(asctime)s - %(message)s') - - @app.route("/") def hello(): return "Hello World!" @@ -33,35 +22,41 @@ def jsonScoreTermLabel(): scoredTerm=scoreTermLabel(label, targetOntology, {}) return jsonify(scoredTerm[0]) -#GeneralFunction to do webservice calls via requests. 
Retries 3 times in case of failure before it fails for good
+#General function to do webservice calls via requests. Retries 3 times in case of failure before it fails for good
 def apiCall(url, data):
     try:
         r = requests.get(url, data)
     except:
-        time.sleep(5)
-        logging.info("API exception, try again after 5 second delay")
+        time.sleep(10)
+        logging.info("API exception, try again after 10 second delay")
         try:
-            r = requests.get(searchURL, data)
-            logging.info("Success")
+            r = requests.get(url, data)
+            logging.info("Success after 10 seconds")
         except:
-            time.sleep(45)
-            logging.info("API exception failed, again - last try, now after addtional 30 seconds delay!")
+            logging.info("API exception failed, again - try, now after additional 120 seconds delay!")
+            time.sleep(120)
             try:
-                r = requests.get(searchURL, data)
-                logging.info("Success")
+                r = requests.get(url, data)
+                logging.info("Success after 120 seconds")
             except:
-                logging.info("Last try failed as well, abort. Total of 3 tries failed, so I let the whole process fail")
-                logging.info(url)
-                logging.info(data)
-                logging.info(r.status_code)
-                logging.info(r.request.url)
-                raise
+                logging.info("API exception failed, again - try, now after additional 300 seconds delay!")
+                time.sleep(300)
+                try:
+                    r = requests.get(url, data)
+                    logging.info("Success after 300 seconds")
+                except:
+                    logging.info("Last try failed as well, abort. Total of 4 tries failed, so I let the whole process fail")
+                    logging.info(url)
+                    logging.info(data)
+                    logging.info(r.status_code)
+                    logging.info(r.request.url)
+                    raise
     return r
 
 #Takes an input label and executes the oxo call
-def oxoMatch(termLabel, targetOntology):
+def oxoMatch(termLabel, targetOntology, url):
     data={"ids":termLabel, "mappingTarget":targetOntology, "distance":3} #Maybe include also 'querySource=' parameters
-    jsonReply=apiCall(oxoURL, data)
+    jsonReply=apiCall(url+"search", data)
     try:
         jsonReply=jsonReply.json()['_embedded']['searchResults'][0]
         tmpList=[]
@@ -70,10 +65,10 @@ def oxoMatch(termLabel, targetOntology):
         for row in jsonReply['mappingResponseList']:
 
             ##Additional webservice call to get the stupid long IRI out of oxo
-            oxoMapURL="https://www.ebi.ac.uk/spot/oxo/api/mappings"
+            #oxoMapURL="https://www.ebi.ac.uk/spot/oxo/api/mappings"
             data={"fromId":row['curie']}
-            longId=apiCall(oxoMapURL, data)
-            longId=longId.json()['_embedded']['mappings'][0]['fromTerm']['uri'] ##
+            longId=apiCall(url+"mappings", data)
+            longId=longId.json()['_embedded']['mappings'][0]['fromTerm']['uri']
             tmpList.append({"curie":longId, "distance":row['distance']})
             #tmpList.append({"curie":row['curie'], "distance":row['distance']})
         sortedCurie=sorted(tmpList, key=lambda tmpList: tmpList['distance'], reverse=False)
@@ -181,25 +176,23 @@ def stringMatcher(sourceTerm, targetTerm, replaceTermList, removeStopwordsList):
 
 
 
-
-
-
-
-
-
 #Takes an input label and executes the fuzzyOLS call
-def olsFuzzyMatch(termLabel, targetOntology, replaceTermList, removeStopwordsList):
+def olsFuzzyMatch(termLabel, targetOntology, replaceTermList, removeStopwordsList, url):
+    url=url+"search"
     data={"q":termLabel, "ontology":targetOntology, "type":"class", "local":True, "fieldList":"label,iri,synonym"}
-    jsonReply=apiCall(searchURL, data)
+    jsonReply=apiCall(url, data)
 
     termLabel=termLabel.encode(encoding='UTF-8')
-    #stringProcess(termLabel)
 
-    #WE found at least 1 hit
     try:
         jsonReply=jsonReply.json()['response']
     except:
         print "Error with deoding jsonReply from OLS api call!"
+ logging.error("Error with deoding jsonReply from OLS api call!") + print data + print url + logging.error(data) + logging.error(jsonReply) print jsonReply if jsonReply['numFound']>0: @@ -241,12 +234,16 @@ def olsFuzzyMatch(termLabel, targetOntology, replaceTermList, removeStopwordsLis ##Now let's relax The fuzzy search and aim for other (all) ontologies data={"q":termLabel, "type":"class", "local":True, "limit":30} - jsonReply=apiCall(searchURL, data) + jsonReply=apiCall(url, data) try: jsonReply=jsonReply.json()['response'] - except: + except Exception as e: print "Error with decoding jsonReply from RELAXED OLS api call!" print jsonReply + print e + logging.error("Error with decoding jsonReply from RELAXED OLS api call!") + logging.error(jsonReply) + logging.error(e) #jsonReply=jsonReply.json()['response'] @@ -259,14 +256,14 @@ def olsFuzzyMatch(termLabel, targetOntology, replaceTermList, removeStopwordsLis return {"fuzzyTerms": sortedLev, "bridgeTerms": oxoTargetList} #Executes the basic calls, delievers primary score (raw scoring) -def primaryScoreTerm(termIRI, termLabel, targetOntology, scoreParams): +def primaryScoreTerm(termIRI, termLabel, targetOntology, scoreParams, urls): replaceTermList=scoreParams["replaceTermList"] removeStopwordsList=scoreParams["removeStopwordsList"] - olsFuzzyResult=olsFuzzyMatch(termLabel, targetOntology, replaceTermList, removeStopwordsList) + olsFuzzyResult=olsFuzzyMatch(termLabel, targetOntology, replaceTermList, removeStopwordsList, urls["ols"]) if termIRI!='': - oxoResults=oxoMatch(termIRI, targetOntology) + oxoResults=oxoMatch(termIRI, targetOntology, urls["oxo"]) else: oxoResults=[{"curie":"UNKNOWN", "distance": 0}] @@ -279,7 +276,7 @@ def primaryScoreTerm(termIRI, termLabel, targetOntology, scoreParams): bridgeOxo=[] if len(bridgeTerms)>0: for bridgeTerm in bridgeTerms: - tmp=oxoMatch(bridgeTerm['short_form'],targetOntology) + tmp=oxoMatch(bridgeTerm['short_form'],targetOntology, urls["oxo"]) for line in tmp: if line['curie']!='UNKNOWN': bridgeOxo.append(tmp) @@ -396,7 +393,9 @@ def simplifyProcessedPscore(mapping): flag=True if flag==False: - obj={"sourceTerm":mapping['sourceTerm'], "sourceIRI":sourceIRI, "iri":line['bridgeOxoCurie'], "fuzzyScore": 0, "oxoScore": 0, "synFuzzy":0, "synOxo": 0, "bridgeOxoScore": line['oxoScore']} + #obj={"sourceTerm":mapping['sourceTerm'], "sourceIRI":sourceIRI, "iri":line['bridgeOxoCurie'], "fuzzyScore": 0, "oxoScore": 0, "synFuzzy":0, "synOxo": 0, "bridgeOxoScore": line['oxoScore']} + obj={"sourceTerm":mapping['sourceTerm'], "sourceIRI":sourceIRI, "iri":line['oxoCurie'], "fuzzyScore": 0, "oxoScore": 0, "synFuzzy":0, "synOxo": 0, "bridgeOxoScore": line['oxoScore']} + #oxoCurie scoreMatrix.append(obj) @@ -477,7 +476,7 @@ def scoreTermLabel(termLabel, targetOntology, scoreParams, params): # Synonymsearch for comparing Ontologies in OLS, should be called instead score Simple for these cases -def scoreTermOLS(termIRI, termLabel, targetOntology, params): - pscore=primaryScoreTerm(termIRI, termLabel, targetOntology, params) +def scoreTermOLS(termIRI, termLabel, targetOntology, params, urls): + pscore=primaryScoreTerm(termIRI, termLabel, targetOntology, params, urls) pscore['sourceIri']=termIRI return pscore diff --git a/dataloading/paxo/neoExporter.py b/dataloading/paxo/neoExporter.py new file mode 100644 index 0000000..5ff6ff3 --- /dev/null +++ b/dataloading/paxo/neoExporter.py @@ -0,0 +1,165 @@ +import csv +import logging +import flaskMapping +import time +from datetime import datetime + +from neo4j.v1 import 
GraphDatabase, basic_auth + + +#Load to neo +#uri = "bolt://localhost:7687" + +#uri = "bolt://localhost:7687" +#encrypted=False +#driver = GraphDatabase.driver(uri, auth=basic_auth("neo4j", "dba")) +#driver = GraphDatabase.driver(uri, auth=basic_auth(user, password)) +#session = driver.session() +#print "Loading terms.csv..." +#loadTermsCypher = """USING PERIODIC COMMIT 10000 +# LOAD CSV WITH HEADERS FROM 'file:///Users/tliener/onto_map/oxo/oxo-loading/testspace/terms.csv' AS line +# MATCH (d:Datasource {prefix : line.prefix}) +# WITH d, line +# MERGE (t:Term { id: line.identifier, curie: line.curie, label: line.label, uri: line.uri}) +# with t,d +# CREATE (t)-[:HAS_SOURCE]->(d)""" +#result = session.run(loadTermsCypher) +#print result.summary() +#print "Loading mappings.csv..." +#loadMappingsCypher = """USING PERIODIC COMMIT 10000 +# LOAD CSV WITH HEADERS FROM 'file:///Users/tliener/onto_map/oxo/oxo-loading/test/spacemappings.csv' AS line +# MATCH (f:Term { curie: line.fromCurie}),(t:Term { curie: line.toCurie}) +# WITH f,t,line +# CREATE (f)-[m:MAPPING { sourcePrefix: line.datasourcePrefix, datasource: line.datasource, sourceType: line.sourceType, scope: line.scope, date: line.date}]->(t)""" +#result = session.run(loadMappingsCypher) +#print result.summary() + +#Global: +date=datetime.now().strftime('%Y-%m-%d') + +def writeTermsToNeo(termsFile, session): + loadMappingsCypher = "USING PERIODIC COMMIT 10000 LOAD CSV WITH HEADERS FROM 'file:///"+termsFile+"""' AS line + MATCH (d:Datasource {prefix : line.prefix}) + WITH d, line + MERGE (t:Term { id: line.identifier, curie: line.curie, label: line.label, uri: line.uri}) + with t,d + CREATE (t)-[:HAS_SOURCE]->(d)""" + + print "Try to load terms from "+termsFile+" to database" + print loadMappingsCypher + result = session.run(loadMappingsCypher) + print result.summary() + +def writeMappingsToNeo(mappingsFile, session): + loadMappingsCypher = "USING PERIODIC COMMIT 10000 LOAD CSV WITH HEADERS FROM 'file:///"+mappingsFile+"""' AS line + MATCH (f:Term { curie: line.fromCurie}),(t:Term { curie: line.toCurie}) + WITH f,t,line + CREATE (f)-[m:MAPPING { sourcePrefix: line.datasourcePrefix, datasource: line.datasource, sourceType: line.sourceType, scope: line.scope, date: line.date}]->(t)""" + + print "Try to load mappings from "+mappingsFile+" to database" + print loadMappingsCypher + result = session.run(loadMappingsCypher) + print result.summary() + +def createNode(iri, ontology, olsURL): + data={"q": iri, "ontology":ontology, "exact":True, "type":"class", "local":True, "fieldList":"label,ontology_prefix,obo_id"} + jsonReply=flaskMapping.apiCall(olsURL+"search", data) + + try: + jsonReply=jsonReply.json() + except Exception as e: + print "Error with decoding json reply from OLS API" + print data + print jsonReply + print e + + if len(jsonReply['response']['docs'])>0: + line=[] + label=jsonReply['response']['docs'][0]['label'].encode('utf-8').strip() + ontology_prefix=jsonReply['response']['docs'][0]['ontology_prefix'] + + try: + obo_id=jsonReply['response']['docs'][0]['obo_id'] + except: + print "Try to replace the obo_id with short form!" + try: + #Add Ontology prefix before the short form (e.g. for MESH) + ontoPrefix=jsonReply['response']['docs'][0]['ontology_prefix'] + obo_id=ontoPrefix+":"+jsonReply['response']['docs'][0]['short_form'] + except: + print "Did not work to retrieve the obo_id nor the short_form from OLS. 
So I use UNKNOWN:UNKNOWN instead"
                print data
                obo_id='UNKNOWN:UNKNOWN'


        identifier=obo_id.split(':')[1]

        line=list([identifier, obo_id, label, iri,ontology_prefix])
        return line
    else:
        print "Did not get a doc from OLS for"
        print data
        print "Therefore this node/mapping is not included in the export!"
        return []

def createMap(curie1, curie2, score):
    line=list([curie1, curie2, 'paxo', 'oxo', 'ALGORITHM', 'PREDICTED', date, score])
    return line

def exportInNeo(onto1, onto2, predictedFolder, targetFolder, olsURL, neoURL, neoUser, neoPW):
    predictedFile=predictedFolder+'calculated_output_'+onto1+"_"+onto2+".csv"

    uri=neoURL
    encrypted=False
    driver = GraphDatabase.driver(uri, auth=basic_auth(neoUser, neoPW))
    session = driver.session()


    paxo_term=[]
    paxo_mappings=[]
    line=list(['identifier','curie', 'label', 'uri', 'prefix'])
    paxo_term.append(line)
    line=list(['fromCurie','toCurie', 'datasourcePrefix', 'datasource', 'sourceType', 'scope', 'date', 'score'])
    paxo_mappings.append(line)

    print "Read in predicted mappings from "+predictedFile
    with open(predictedFile) as csvfile:
        readCSV = csv.reader(csvfile, delimiter=str(','))
        next(readCSV)
        counter=0
        for row in readCSV:
            firstRow=createNode(row[0], onto1, olsURL)
            secondRow=createNode(row[1], onto2, olsURL)

            if firstRow!=[]:
                paxo_term.append(firstRow)
            if secondRow!=[]:
                paxo_term.append(secondRow)

            if firstRow!=[] and secondRow!=[]:
                paxo_mappings.append(createMap(firstRow[1],secondRow[1], row[2]))

            #This is just for Testing, don't take more than 10
            #counter=counter+1
            #if counter>10:
            #    break

    #print paxo_term
    with open(targetFolder+onto1+"_"+onto2+'_termsNeo.csv', 'wb') as f:
        writer = csv.writer(f)
        writer.writerows(paxo_term)
        f.close()

    with open(targetFolder+onto1+"_"+onto2+'_mappingsNeo.csv', 'wb') as f2:
        writer = csv.writer(f2)
        writer.writerows(paxo_mappings)
        f2.close()

    writeTermsToNeo(targetFolder+onto1+"_"+onto2+'_termsNeo.csv', session)
    writeMappingsToNeo(targetFolder+onto1+"_"+onto2+'_mappingsNeo.csv', session)


    #After Loading, update solr indexes (Might be done outside of this script so commented out for now)
    #print "updating indexes"
    #reply = urllib.urlopen(oxoUrl+"/api/search/rebuild?apikey="+apikey)
    #print "Finished process!"
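
A minimal usage sketch for the exporter above, assuming the folder layout used by clientOperations.py; the bolt URL and credentials echo the commented-out defaults at the top of this file and are placeholders, not values shipped with this repository:

# Assumes final_dec/predicted/calculated_output_hp_doid.csv exists and a Neo4j
# instance is reachable on the bolt port.
import neoExporter

neoExporter.exportInNeo("hp", "doid",
                        "final_dec/predicted/",            # predictedFolder
                        "final_dec/neo/",                  # targetFolder for the generated *_termsNeo.csv / *_mappingsNeo.csv
                        "https://www.ebi.ac.uk/ols/api/",  # olsURL, used to resolve labels and curies
                        "bolt://localhost:7687",           # neoURL (placeholder)
                        "neo4j", "dba")                    # neoUser / neoPW (placeholders)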
diff --git a/paxo/readme.md b/dataloading/paxo/readme.md similarity index 100% rename from paxo/readme.md rename to dataloading/paxo/readme.md diff --git a/paxo/requirements.txt b/dataloading/paxo/requirements.txt similarity index 74% rename from paxo/requirements.txt rename to dataloading/paxo/requirements.txt index 90934b7..176414b 100644 --- a/paxo/requirements.txt +++ b/dataloading/paxo/requirements.txt @@ -1,7 +1,8 @@ requests python-levenshtein flask +neo4j-driver spotpy numpy matplotlib -pandas \ No newline at end of file +pandas diff --git a/paxo/validation.py b/dataloading/paxo/validation.py similarity index 83% rename from paxo/validation.py rename to dataloading/paxo/validation.py index def2d79..0c5537f 100644 --- a/paxo/validation.py +++ b/dataloading/paxo/validation.py @@ -2,29 +2,21 @@ import logging import requests import time -from ConfigParser import SafeConfigParser -#url="https://www.ebi.ac.uk/ols/api/search" -#url="http://snarf.ebi.ac.uk:8980/ols-beta/api/search" +#from ConfigParser import SafeConfigParser +#config = SafeConfigParser() +#config.read("config.ini") -config = SafeConfigParser() -config.read("config.ini") - -def validateFinaleScore(onto1, onto2, stdNamed, inputFile, TargetFile, writeToDisc, params, parseParms, validationTargetFolder): +def validateFinaleScore(onto1, onto2, stdNamed, inputFile, TargetFile, writeToDisc, params, parseParms, validationTargetFolder, url): uri1Position=parseParms['uri1'] uri2Position=parseParms['uri2'] counterPosition=parseParms['scorePosition'] delimiterChar=parseParms['delimiter'] - url=config.get("Basics","olsURL") - - print inputFile - print TargetFile - print validationTargetFolder - print uri1Position - print uri2Position + #url=config.get("Basics","olsURL") - logging.basicConfig(filename="flask.log", level=logging.INFO, format='%(asctime)s - %(message)s') + print "Validate ... 
" + #logging.basicConfig(filename="flask.log", level=logging.INFO, format='%(asctime)s - %(message)s') inputList=[] inputLongList=[] @@ -167,16 +159,16 @@ def validateFinaleScore(onto1, onto2, stdNamed, inputFile, TargetFile, writeToDi logging.info("synFuzzyFactor, "+str(params["synFuzzyFactor"])) logging.info("synOxoFactor, "+str(params["synOxoFactor"])) - logging.info("Stats for "+str(onto1)+"_"+str(onto2)+" validation "+stdNamed) - logging.info("Number of std mappings, "+str(len(targetList))) - logging.info("Total Matches, "+str(len(matches))) - logging.info("Algorithm missed compared to std, "+str(len(missing))) - logging.info("Suspected Obsoleted Terms, "+str(obsoleteScore)) - logging.info("Algorithm missed compared to std MINUS obsoleted terms in std, "+str(len(missing)-obsoleteScore)) - logging.info("Total unique terms suggested, "+str(len(alternatives))) - logging.info("UniqueOverlappingWithMisses, "+str(alternativeCounter)) - logging.info("Recall, "+str((len(matches)/(len(targetList)-obsoleteScore*1.0))*100)+" in %\n") - + msg="Stats for "+str(onto1)+"_"+str(onto2)+" validation "+stdNamed+"\n" + msg=msg+"Number of std mappings, "+str(len(targetList))+"\n" + msg=msg+"Total Matches, "+str(len(matches))+"\n" + msg=msg+"Algorithm missed compared to std, "+str(len(missing))+"\n" + msg=msg+"Suspected Obsoleted Terms, "+str(obsoleteScore)+"\n" + msg=msg+"Algorithm missed compared to std MINUS obsoleted terms in std, "+str(len(missing)-obsoleteScore)+"\n" + msg=msg+"Total unique terms suggested, "+str(len(alternatives))+"\n" + msg=msg+"UniqueOverlappingWithMisses, "+str(alternativeCounter)+"\n" + msg=msg+"Recall, "+str(round(((len(matches)/(len(targetList)-obsoleteScore*1.0))*100),2)) + logging.info(msg) #logging.info("NotMapped: "+str(len(discarted))+"\n") From de5d13f5964ac39e4f1eab1e9520730c3278d7c8 Mon Sep 17 00:00:00 2001 From: LLTommy Date: Wed, 10 Jan 2018 17:00:04 +0000 Subject: [PATCH 10/66] move requirements up since it works for both --- dataloading/{paxo => }/requirements.txt | 3 +++ 1 file changed, 3 insertions(+) rename dataloading/{paxo => }/requirements.txt (78%) diff --git a/dataloading/paxo/requirements.txt b/dataloading/requirements.txt similarity index 78% rename from dataloading/paxo/requirements.txt rename to dataloading/requirements.txt index 176414b..82c2201 100644 --- a/dataloading/paxo/requirements.txt +++ b/dataloading/requirements.txt @@ -2,6 +2,9 @@ requests python-levenshtein flask neo4j-driver +pyyaml +mysql-python + spotpy numpy matplotlib From c8587c2f54fb7aaef1a67e8232fd28a1178c20c6 Mon Sep 17 00:00:00 2001 From: LLTommy Date: Wed, 10 Jan 2018 17:12:06 +0000 Subject: [PATCH 11/66] Adding config file support for OLS loader --- dataloading/oxo/MappingLoader.py | 14 ++++++++++++-- dataloading/oxo/OlsDatasetLoader.py | 10 +++++++--- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/dataloading/oxo/MappingLoader.py b/dataloading/oxo/MappingLoader.py index afda731..de69d3f 100644 --- a/dataloading/oxo/MappingLoader.py +++ b/dataloading/oxo/MappingLoader.py @@ -12,8 +12,18 @@ from ConfigParser import SafeConfigParser -config = SafeConfigParser() -config.read("../config/oxo_dataRelease_config.ini") +#Parse the input parameters. A config file and a flag is expected +if len(sys.argv)!=2: + print "\nNot enough arguments! Please pass a (path) of a config file!" + raise Exception("Not enough arguments! 
Please pass in a config file!")
+else:
+    config = SafeConfigParser()
+    config.read(sys.argv[1])
+
+
+
+#config = SafeConfigParser()
+#config.read("../config/oxo_dataRelease_config.ini")
 
 OXO.oxoUrl=config.get("Basics","oxoUrl")
 OXO.apikey=config.get("Basics","oxoAPIkey")
diff --git a/dataloading/oxo/OlsDatasetLoader.py b/dataloading/oxo/OlsDatasetLoader.py
index d73005f..5b117b5 100644
--- a/dataloading/oxo/OlsDatasetLoader.py
+++ b/dataloading/oxo/OlsDatasetLoader.py
@@ -13,9 +13,13 @@
 termToIri = {}
 termToLabel = {}
 
-#config.read(sys.argv[1])
-config = SafeConfigParser()
-config.read("../config/oxo_dataRelease_config.ini")
+#Parse the input parameters. A config file is expected
+if len(sys.argv)!=2:
+    print "\nNot enough arguments! Please pass the path of a config file!"
+    raise Exception("Not enough arguments! Please pass in a config file!")
+else:
+    config = SafeConfigParser()
+    config.read(sys.argv[1])
 
 OXO.oxoUrl = config.get("Basics","oxoUrl")
 OXO.apikey = config.get("Basics", "oxoAPIkey")

From 1750dd92a9ebac7fd1e24926f1504b21349e51cb Mon Sep 17 00:00:00 2001
From: LLTommy
Date: Wed, 10 Jan 2018 18:11:19 +0000
Subject: [PATCH 12/66] Added a missing import

---
 dataloading/oxo/OlsDatasetLoader.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dataloading/oxo/OlsDatasetLoader.py b/dataloading/oxo/OlsDatasetLoader.py
index 5b117b5..81bbafb 100644
--- a/dataloading/oxo/OlsDatasetLoader.py
+++ b/dataloading/oxo/OlsDatasetLoader.py
@@ -5,6 +5,7 @@
 import OxoClient as OXO
 import csv
 from ConfigParser import SafeConfigParser
+import sys
 
 prefixToPreferred = {}
 idorgNamespace = {}

From bcab3026d9eeb4e0b98a53a0db8b1c3f3dd5e6b6 Mon Sep 17 00:00:00 2001
From: LLTommy
Date: Wed, 10 Jan 2018 18:14:47 +0000
Subject: [PATCH 13/66] Increase the size of the ols parameter, slipped through, forgot to change it

---
 dataloading/paxo/clientOperations.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataloading/paxo/clientOperations.py b/dataloading/paxo/clientOperations.py
index 464fc7e..ba17bba 100644
--- a/dataloading/paxo/clientOperations.py
+++ b/dataloading/paxo/clientOperations.py
@@ -39,7 +39,7 @@ def scoreOntologies(sourceOntology, targetOntology, scoreParams, scoringtargetFo
         sourceOntology=targetOntology
         targetOntology=tmpOntology
 
-    termsUrl=olsURL+"ontologies/"+sourceOntology+"/terms?size=10&fieldList=iri,label,synonym"
+    termsUrl=olsURL+"ontologies/"+sourceOntology+"/terms?size=500&fieldList=iri,label,synonym"
 
     results=[]
     results.append(["sourceLabel","sourceIRI", "fuzzy", "oxo", "synFuzzy", "synOxo", "bridgeTerms"])

From 4f264f888cca5d50629665b06d94c2068c177cd1 Mon Sep 17 00:00:00 2001
From: LLTommy
Date: Mon, 15 Jan 2018 12:20:38 +0000
Subject: [PATCH 14/66] Reworking the oxo layout. Not finished but this is an intermediate commit.
--- .../main/java/uk/ac/ebi/spot/model/Scope.java | 4 +- .../java/uk/ac/ebi/spot/model/SourceType.java | 1 + oxo-web/src/main/resources/static/css/oxo.css | 18 +- .../resources/static/js/oxo-search-result.js | 21 +- .../src/main/resources/templates/about.html | 249 ++-------- .../main/resources/templates/datasource.html | 338 ++++--------- .../src/main/resources/templates/docs.html | 227 +-------- .../resources/templates/fragments/footer.html | 166 +++++++ .../resources/templates/fragments/head.html | 99 ++++ .../resources/templates/fragments/header.html | 115 +++++ .../src/main/resources/templates/index.html | 410 ++++------------ .../src/main/resources/templates/mapping.html | 57 +-- .../src/main/resources/templates/search.html | 297 ++++-------- .../src/main/resources/templates/terms.html | 456 +++++------------- 14 files changed, 893 insertions(+), 1565 deletions(-) create mode 100644 oxo-web/src/main/resources/templates/fragments/footer.html create mode 100644 oxo-web/src/main/resources/templates/fragments/head.html create mode 100644 oxo-web/src/main/resources/templates/fragments/header.html diff --git a/oxo-model/src/main/java/uk/ac/ebi/spot/model/Scope.java b/oxo-model/src/main/java/uk/ac/ebi/spot/model/Scope.java index f098369..aa8e4a5 100644 --- a/oxo-model/src/main/java/uk/ac/ebi/spot/model/Scope.java +++ b/oxo-model/src/main/java/uk/ac/ebi/spot/model/Scope.java @@ -6,9 +6,9 @@ * Samples, Phenotypes and Ontologies Team, EMBL-EBI */ public enum Scope { - EXACT, NARROW, BROAD, - RELATED + RELATED, + PREDICTED } diff --git a/oxo-model/src/main/java/uk/ac/ebi/spot/model/SourceType.java b/oxo-model/src/main/java/uk/ac/ebi/spot/model/SourceType.java index 4d758d6..1daa169 100644 --- a/oxo-model/src/main/java/uk/ac/ebi/spot/model/SourceType.java +++ b/oxo-model/src/main/java/uk/ac/ebi/spot/model/SourceType.java @@ -8,6 +8,7 @@ public enum SourceType { ONTOLOGY, DATABASE, + ALGORITHM, USER, MANUAL } diff --git a/oxo-web/src/main/resources/static/css/oxo.css b/oxo-web/src/main/resources/static/css/oxo.css index aa063c5..1b651be 100644 --- a/oxo-web/src/main/resources/static/css/oxo.css +++ b/oxo-web/src/main/resources/static/css/oxo.css @@ -3,8 +3,8 @@ padding:2px; padding-right:4px; color: white; - /*font-size: larger;*/ - border-radius: 3px; + /*font-size: larger; + border-radius: 3px;*/ display: inline-block; margin-right: 4px; vertical-align: middle; @@ -48,3 +48,17 @@ cursor: pointer; } +/* Here we introduce new css for the new version*/ +.alert-warning{ + background-color: #faebcc; + margin-bottom: 10px; + padding: 10px 10px 10px 10px; + border-radius: 25px; +} + +.grayBackground{ +background-color: #f2f2f2; +/*border-radius: 25px;*/ +padding: 10px 10px 10px 10px; +margin: 10px; +} \ No newline at end of file diff --git a/oxo-web/src/main/resources/static/js/oxo-search-result.js b/oxo-web/src/main/resources/static/js/oxo-search-result.js index cda3a67..7501237 100644 --- a/oxo-web/src/main/resources/static/js/oxo-search-result.js +++ b/oxo-web/src/main/resources/static/js/oxo-search-result.js @@ -14,6 +14,7 @@ var hideFromCol = false; var apiPath = ''; function initialisePage() { + console.log("In initialise Page") withProgress = $("#example").data("with-progress") ? $("#example").data("with-progress") : true; hideTableInfo = $("#example").data("hide-table-info") ? 
$("#example").data("hide-table-info"): false; @@ -302,6 +303,8 @@ function getApiPath(element) { } function progressComplete() { + $("#searching_bar").hide(); + $("#searching_bar").hide(); if (withProgress) { $( ".progress-label" ).text( "Complete!" ); } @@ -309,8 +312,15 @@ function progressComplete() { function addProgressBar() { + console.log("In add Progress Bar, but nothing happens here anymore?") + +/* + var progressbar = $( "#progressbar" ), progressLabel = $( ".progress-label" ); + var progressbar = $( "#progressbar" ), + progressLabel = $( ".aria-valuetext" ); + progressbar.progressbar({ value: false, @@ -319,13 +329,16 @@ function addProgressBar() { }, complete: function() { progressLabel.text( "Complete!" ); + + $("#searching_bar").hide(); } - }); + });*/ } function updateProgress(value) { - if (withProgress) { - $("#progressbar").progressbar( "value", value) - } + console.log("In Add progress bar, but that is useless now isn't it? ") + + //if (withProgress) { $("#progressbar").progressbar( "value", value) } + } diff --git a/oxo-web/src/main/resources/templates/about.html b/oxo-web/src/main/resources/templates/about.html index ba042f7..8dbae8f 100644 --- a/oxo-web/src/main/resources/templates/about.html +++ b/oxo-web/src/main/resources/templates/about.html @@ -5,243 +5,54 @@ - - - - - Docs < Ontology Xref Service < EMBL-EBI - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + Ontology Xref Service < EMBL-EBI + - - - - -
- -
- -
- - - EMBL European Bioinformatics Institute - - - -
- -
- - - - -
-

Ontology Xref Service

-
- - - -
- -
- - -
- - -
- -
-
- - -
- - - - -
- -

OxO is a database of ontology cross-references (xrefs) extracted from public ontologies and databases. Most of these cross-references have been extracted from ontologies in the - Ontology Lookup Service by searching for database cross-reference annotations on terms. We have supplemented these cross-references with - mappings from a subset of vocabularies in the UMLS.

- -

- The semantics of a cross-reference are weakly specified; in most cases they mean some kind of operational equivalence, but there is no guarantee. - Sometimes cross-references are used to indicate other types of relationships, such as parent/child, or that the terms are related in some other way (such as linking a disease concept to a pathway accession that is somehow related to that disease). - OxO aims to provide simple and convenient access to cross-references, but is not a mapping prediction service, so always treat these xrefs with caution, especially if you are seeking true equivalence between two ontologies. -

- -

- OxO gives you access to existing mappings; you can also explore the neighbourhood of a mapping using the distance controller. By default OxO shows you direct asserted mappings, but you can use the slider on various pages to look for mappings that are up to - three hops away. You may see some terms that don't have labels associated with them; we are doing our best to find labels for all of these terms, but sometimes the labels are missing from the sources that we extract mappings from. -

- -

- OxO is developed by the Samples, Phenotypes and Ontologies team. If you have any questions about OxO, please contact us. -

- -
- -
- -
-
- - +
+

OxO is a database of ontology cross-references (xrefs) extracted from public ontologies and databases. Most of these cross-references have been extracted from ontologies in the + Ontology Lookup Service by searching for database cross-reference annotations on terms. We have supplemented these cross-references with + mappings from a subset of vocabularies in the UMLS.

-
+

+ The semantics of a cross-reference are weakly specified; in most cases they mean some kind of operational equivalence, but there is no guarantee. + Sometimes cross-references are used to indicate other types of relationships, such as parent/child, or that the terms are related in some other way (such as linking a disease concept to a pathway accession that is somehow related to that disease). + OxO aims to provide simple and convenient access to cross-references, but is not a mapping prediction service, so always treat these xrefs with caution, especially if you are seeking true equivalence between two ontologies. +
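That caution lines up with the Scope enum touched in this same patch series, which now distinguishes EXACT, NARROW, BROAD, RELATED and PREDICTED. A small illustrative sketch (Python here for brevity; OxO itself models this in Java) of how a client might act on the scope:

# Mapping scopes as enumerated in oxo-model's Scope enum (this series adds PREDICTED).
SCOPES = ("EXACT", "NARROW", "BROAD", "RELATED", "PREDICTED")

def safe_for_equivalence(scope):
    # Only EXACT xrefs should be treated as operational equivalence;
    # everything else needs review before terms are merged.
    return scope == "EXACT"

for scope in SCOPES:
    print scope, "->", ("equivalence" if safe_for_equivalence(scope) else "review")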

+

+ OxO gives you access to existing mappings; you can also explore the neighbourhood of a mapping using the distance controller. By default OxO shows you direct asserted mappings, but you can use the slider on various pages to look for mappings that are up to + three hops away. You may see some terms that don't have labels associated with them; we are doing our best to find labels for all of these terms, but sometimes the labels are missing from the sources that we extract mappings from. +
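In API terms, the slider corresponds to a distance parameter. A hedged sketch against the search endpoint follows; the /spot/oxo/api/search path, parameter names and response fields are assumptions about the deployed service, not taken from this patch:

import requests

# Assumed endpoint and payload shape; "distance" mirrors the UI slider (1-3 hops).
payload = {"ids": ["MESH:D009202"], "distance": 2}
r = requests.post("https://www.ebi.ac.uk/spot/oxo/api/search", data=payload)
r.raise_for_status()
for result in r.json()["_embedded"]["searchResults"]:
    print result["queryId"], "->", [m["curie"] for m in result["mappingResponseList"]]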

+

+ OxO is developed by the Samples, Phenotypes and Ontologies team. If you have any questions about OxO, please contact us. +

+ - - - \ No newline at end of file diff --git a/oxo-web/src/main/resources/templates/datasource.html b/oxo-web/src/main/resources/templates/datasource.html index 3453993..6a33415 100644 --- a/oxo-web/src/main/resources/templates/datasource.html +++ b/oxo-web/src/main/resources/templates/datasource.html @@ -4,190 +4,72 @@ - - - + - - - Datasource < Ontology Xref Service < EMBL-EBI - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + Ontology Xref Service < EMBL-EBI + - - - -
-
+ -
+ - - EMBL European Bioinformatics Institute +
+
- + -
- - - - -
-

Ontology Xref Service

-
- - - - -
- -
- - - - + +
+ Success message goes here.
- -
- -
-
- - - - - -
- Success message goes here. -
- - -
- Error message goes here. -
+ +
+ Error message goes here. +
-
+
- + -
-
-

Datasource

+
+
+

Datasource

+
+
+
+ id...
-
-
- id... -
-
- id... -
-
- Version info: id... -
-
- Licence info: id... -
-
- Prefix: id... -
-
+
+ id... +
+
+ Version info: id... +
+
+ Licence info: id... +
+
+ Prefix: id... +
+
@@ -198,133 +80,93 @@

Datasource

th:onclick="'window.location.href=\'' + @{//www.ebi.ac.uk/ols/ontologies/{ontology}(ontology=${datasource.getPrefix()})} + '\''" class="btn btn-default">View in OLS
-
+
-
+
-
-
-

Mappings

-
-
- - -
- - - Mapping Distance: -
- -
-
-
- - -
+
+
+

Mappings

+
+
-
-
- -
+
+ + + Mapping Distance: +
+
-
-
- -
-
- -
- -
+ - - - - - - - - + + + + - \ No newline at end of file + + + diff --git a/oxo-web/src/main/resources/templates/docs.html b/oxo-web/src/main/resources/templates/docs.html index 6835ebf..bc72650 100644 --- a/oxo-web/src/main/resources/templates/docs.html +++ b/oxo-web/src/main/resources/templates/docs.html @@ -5,226 +5,35 @@ - - - - - Docs < Ontology Xref Service < EMBL-EBI - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + Ontology Xref Service < EMBL-EBI + - - - - -
- -
- -
- - - EMBL European Bioinformatics Institute - - - -
- -
- - - - -
-

Ontology Xref Service

-
- - - - -
- -
- - - -
- - -
- -
-
- - -
- - - - -
- -

The root of the REST API is here; full documentation is coming soon.
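Until then, the API root itself should be explorable. A sketch, assuming the /spot/oxo/api path and a HAL-style _links field (both are assumptions about the deployed service):

import json
import requests

r = requests.get("https://www.ebi.ac.uk/spot/oxo/api/")
r.raise_for_status()
# A HAL-style root would list links to datasources, terms and mappings.
print json.dumps(r.json().get("_links", {}), indent=2)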

- -
- -
- -
-
- - - -
+ + - - - + - \ No newline at end of file + + diff --git a/oxo-web/src/main/resources/templates/fragments/footer.html b/oxo-web/src/main/resources/templates/fragments/footer.html new file mode 100644 index 0000000..49b7aa6 --- /dev/null +++ b/oxo-web/src/main/resources/templates/fragments/footer.html @@ -0,0 +1,166 @@ + + + + + + +
+
+ + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + \ No newline at end of file diff --git a/oxo-web/src/main/resources/templates/fragments/head.html b/oxo-web/src/main/resources/templates/fragments/head.html new file mode 100644 index 0000000..1cda266 --- /dev/null +++ b/oxo-web/src/main/resources/templates/fragments/head.html @@ -0,0 +1,99 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/oxo-web/src/main/resources/templates/fragments/header.html b/oxo-web/src/main/resources/templates/fragments/header.html new file mode 100644 index 0000000..f2c38d8 --- /dev/null +++ b/oxo-web/src/main/resources/templates/fragments/header.html @@ -0,0 +1,115 @@ + + + + + + + + + +
+
+
+
+ +
+ + + + + +
+ +
+ + +
+

+ + Ontology Xref Service + +

+
+ + + + + +
+ +
+
+
+
+ + + + \ No newline at end of file diff --git a/oxo-web/src/main/resources/templates/index.html b/oxo-web/src/main/resources/templates/index.html index 5cbac3c..8ce9a92 100644 --- a/oxo-web/src/main/resources/templates/index.html +++ b/oxo-web/src/main/resources/templates/index.html @@ -5,348 +5,152 @@ - - - - - - - Home < Ontology Xref Service < EMBL-EBI - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + Ontology Xref Service < EMBL-EBI + + - - - - - - - - - -
+ -
-
+ - - EMBL European Bioinformatics Institute - - +
+
+ -
- - - - -
-

Ontology Xref Service

-
- - - - -
- -
- - - - +
+

Welcome to the EMBL-EBI Ontology Xref Service (OxO).

+

OxO is a service for finding mappings (or cross-references) between terms from ontologies, vocabularies and coding standards. OxO imports mappings from a variety of sources, including the + Ontology Lookup Service and a subset of mappings provided by the UMLS. We're still developing the service, so please get in touch if you have any feedback.
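As a concrete example of the kind of lookup this enables, a client can ask for the mappings of one identifier into a chosen target datasource, using the same assumed search endpoint sketched earlier (DOID:162 and the EFO target are purely illustrative):

import requests

# Assumed parameter names; "mappingTarget" restricts results to one datasource.
payload = {"ids": ["DOID:162"], "mappingTarget": ["EFO"]}
r = requests.post("https://www.ebi.ac.uk/spot/oxo/api/search", data=payload)
r.raise_for_status()
for result in r.json()["_embedded"]["searchResults"]:
    for m in result["mappingResponseList"]:
        print result["queryId"], "maps to", m["curie"]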

+
-
- -
-
- - -
- - - - -
+
-
- - -

Welcome to the EMBL-EBI Ontology Xref Service (OxO).

-

OxO is a service for finding mappings (or cross-references) between terms from ontologies, vocabularies and coding standards. OxO imports mappings from a variety of sources, including the - Ontology Lookup Service and a subset of mappings provided by the UMLS. We're still developing the service, so please get in touch if you have any feedback.

-
+
+
+

Search OxO by term id

+
+ -
- -
- -
- - - -
- Search OxO by term id - - - - Choose a target (optional) - - - Mapping Distance: - - - - - - - - - - - - - - - - - - - -
- -
- Enter list of identifiers: - - -
- - -
-
- -
+ + + Choose a target (optional) + -
+ -
-
-

OxO summary

-
-
-

- To use OxO, either search for mappings using a particular identifier (e.g. MESH:D009202) or select a datasource below to view all mappings between datasources. -

+ -
- -
+ -
-
- -
+ Enter list of identifiers: + + +
+ + +
+ + +
-
+
+
+
+

OxO summary view

+
+
+

To use OxO, either search for mappings using a particular identifier (e.g. MESH:D009202) or select a datasource below to view all mappings between datasources. -

-
+
+ + +
+
+
- -
+
- - -
-
+ - + + -
+ + - - - - - - - - - - - + - - diff --git a/oxo-web/src/main/resources/templates/mapping.html b/oxo-web/src/main/resources/templates/mapping.html index b4c6964..e80e3b6 100644 --- a/oxo-web/src/main/resources/templates/mapping.html +++ b/oxo-web/src/main/resources/templates/mapping.html @@ -5,48 +5,21 @@ - - - - - Mapping < Ontology Xref Service < EMBL-EBI - - - - - + + Ontology Xref Service < EMBL-EBI + - + + + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +