Skip to content

Commit

Permalink
Sync LOKI 3.0.0
Browse files Browse the repository at this point in the history
  • Loading branch information
AndreRico committed Jan 9, 2025
1 parent 126cde7 commit 92fb502
Show file tree
Hide file tree
Showing 40 changed files with 17,738 additions and 8,488 deletions.
14 changes: 12 additions & 2 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,7 +1,17 @@
CHANGELOG

2.4.4 (2024-DEV)
- IN DEVELOPMENT
3.0.1 (2025-DEV)
- In development

3.0.0 (Transition)

2.4.4 (2025-01-08)
- Reorganized code into functional packages
- Developed unit tests
- Added tools to support the project, such as black, Sphinx, poetry, pytest, and tox
- Migrated setup.py configuration to Poetry, applying all dependencies
- Fixed inconsistencies to ensure compatibility with Python versions 3.10 to 3.12
- Added technical documentation for the project

2.4.3 (2023-09-15)
- updated from Python2 to Python3
Expand Down
4 changes: 4 additions & 0 deletions docs-dev/Status_reports_3_0_1/report_01/activites.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
- Closed the 2.4.4 version
- Merged the development branch into the main branch
- Changed the development branch to version 3.0.1
- Compared the Biofilter 2.4.4 and 3.0.0 schemas (no differences found)
101 changes: 101 additions & 0 deletions loki_modules/loaders/loaders_unsupported/loki_source_disgenet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#!/usr/bin/env python

import collections
import re
import apsw
from sh import gunzip
from loki import loki_source


class Source_disgenet(loki_source.Source):
    """Loader that imports DisGeNET diseases and their gene associations.

    The data comes from the DisGeNET SQLite dump: every disease becomes a
    group of type 'disease', and every gene/disease association becomes a
    group member named by its gene id in the 'entrez_gid' namespace.
    """


    @classmethod
    def getVersionString(cls):
        """Return the version string of this loader."""
        return '1.0 (2023-08-08)'
    #getVersionString()


    def download(self, options):
        """Download the gzipped DisGeNET SQLite dump.

        `options` is accepted for interface compatibility and is not used.
        """
        # download the latest source files
        self.downloadFilesFromHTTP('disgenet.org', {
            'disgenet_2020.db.gz': '/static/disgenet_ap1/files/sqlite_downloads/current/disgenet_2020.db.gz',
        })
    #download()


    def update(self, options):
        """Rebuild this source's records from the downloaded DisGeNET dump.

        Deletes the records previously stored by this source, extracts and
        reads the SQLite dump, then writes disease groups, their names and
        the gene/disease associations.

        `options` is accepted for interface compatibility and is not used.
        """
        # imported locally so the method does not rely on module-level imports
        import gzip
        import shutil

        # clear out all old data from this source
        self.log("deleting old records from the database ...")
        self.deleteAll()
        self.log(" OK\n")

        # get or create the required metadata records
        namespaceID = self.addNamespaces([
            ('disgenet_id', 0),
            ('entrez_gid', 0),
            ('disease', 0),
        ])
        typeID = self.addTypes([
            ('disease',),
            ('gene',),
        ])
        subtypeID = self.addSubtypes([
            ('-',),
        ])

        # process disgenet sqlite file
        self.log("processing diseases ...")
        # Extract with the standard library instead of the external `gunzip`
        # tool: this overwrites a database left behind by an earlier run
        # (gunzip refuses to) and keeps the downloaded archive in place.
        with gzip.open('disgenet_2020.db.gz', 'rb') as packed, \
                open('disgenet_2020.db', 'wb') as unpacked:
            shutil.copyfileobj(packed, unpacked)

        con = apsw.Connection('disgenet_2020.db')
        try:
            cur = con.cursor()

            # disease class NID -> class name
            cur.execute('select diseaseClassNID,diseaseClassName from diseaseClass')
            diseaseClass = {row[0]: row[1].strip() for row in cur.fetchall()}

            # disease id -> (name, class NID or None); when the join returns
            # several rows for one disease, the last row wins
            cur.execute('SELECT a.diseaseId,a.diseaseName,b.diseaseClassNID FROM diseaseAttributes a LEFT JOIN disease2class b ON a.diseaseNID=b.diseaseNID order by a.diseaseNID')
            diseases = {row[0]: (row[1], row[2]) for row in cur.fetchall()}
            self.log(" OK: %d disease\n" % (len(diseases),))

            # store diseases
            self.log("writing diseases to the database ...")
            classSubtypeID = self.addSubtypes([(name,) for name in set(diseaseClass.values())])
            listGroup = list(diseases)
            listAID = self.addTypedGroups(typeID['disease'], (
                (
                    # diseases without a class fall back to the '-' subtype
                    subtypeID['-'] if diseases[diseaseID][1] is None
                    else classSubtypeID[diseaseClass[diseases[diseaseID][1]]],
                    diseases[diseaseID][0],
                    None,
                )
                for diseaseID in listGroup
            ))
            groupAID = dict(zip(listGroup, listAID))
            self.log(" OK\n")

            # store diseases names
            self.log("writing diseases names to the database ...")
            self.addGroupNamespacedNames(namespaceID['disgenet_id'], ((groupAID[diseaseID], diseaseID) for diseaseID in listGroup))
            self.addGroupNamespacedNames(namespaceID['disease'], ((groupAID[diseaseID], diseases[diseaseID][0]) for diseaseID in listGroup))
            # release the lookup tables before loading the association rows
            diseases = None
            diseaseClass = None
            self.log(" OK\n")

            # process disgenet gene/disease associations
            self.log("processing diseases identifiers ...")
            cur.execute('SELECT b.geneId,c.diseaseId FROM geneDiseaseNetwork a LEFT JOIN geneAttributes b ON a.geneNID=b.geneNID LEFT JOIN diseaseAttributes c ON a.diseaseNID=c.diseaseNID ORDER BY c.diseaseId')
            diseaseGeneResult = cur.fetchall()
        finally:
            # close the dump even when a query or a write above fails
            con.close()

        diseaseGene = []
        for geneID, diseaseID in diseaseGeneResult:
            # the LEFT JOINs may yield a NULL gene; also skip rows whose
            # disease was not stored as a group above
            if geneID is None or diseaseID not in groupAID:
                continue
            # the running position doubles as the member number of the pair
            diseaseGene.append((groupAID[diseaseID], len(diseaseGene) + 1, geneID))
        diseaseGeneResult = None
        self.log(" OK: %d diseases and gene pairs\n" % (len(diseaseGene),))

        # store disgenet gene/disease associations
        self.log("writing diseases and gene pairs to the database ...")
        self.addGroupMemberTypedNamespacedNames(typeID['gene'], namespaceID['entrez_gid'], diseaseGene)
        diseaseGene = None
        self.log(" OK\n")

    #update()

#Source_disgenet
148 changes: 148 additions & 0 deletions loki_modules/loaders/loaders_unsupported/loki_source_gaad.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
#!/usr/bin/env python

import collections
import re
from loki import loki_source


class Source_gaad(loki_source.Source):
    """Loader that imports GAAD diseases, disease pairs and gene associations.

    Every disease becomes a group of type 'disease'; disease pairs are
    stored as 'disease_co-occurring' group relationships; gene/disease
    associations become group members named by their gene id in the
    'entrez_gid' namespace.
    """


    @classmethod
    def getVersionString(cls):
        """Return the version string of this loader."""
        return '1.0 (2023-06-08)'
    #getVersionString()


    def download(self, options):
        """Download the GAAD disease, relationship and association files.

        `options` is accepted for interface compatibility and is not used.
        """
        # download the latest source files
        self.downloadFilesFromHTTPS('gaad.medgenius.info', {
            'diseases2.txt.gz': '/Downloads/diseases2.txt.gz', # disease name by AID
            'disease_relationships.txt.gz': '/Downloads/disease_relationships.txt.gz',
            'disease_association_database_annotations_uniprot_ncbiGene.txt.gz': '/Downloads/disease_association_database_annotations_uniprot_ncbiGene.txt.gz',
            'disease_association_genecards.txt.gz': '/Downloads/disease_association_genecards.txt.gz',
            'disease_gene_association_pubmed_textmining_zhao.txt.gz': '/Downloads/disease_gene_association_pubmed_textmining_zhao.txt.gz',
        })
    #download()


    def update(self, options):
        """Rebuild this source's records from the downloaded GAAD files.

        Deletes the records previously stored by this source, then writes
        disease groups, their names, the disease relationships and the
        gene/disease associations.

        `options` is accepted for interface compatibility and is not used.
        """
        # clear out all old data from this source
        self.log("deleting old records from the database ...")
        self.deleteAll()
        self.log(" OK\n")

        # get or create the required metadata records
        namespaceID = self.addNamespaces([
            ('gaad_id', 0),
            ('entrez_gid', 0),
            ('disease', 0),
        ])
        relationshipID = self.addRelationships([
            ('disease_co-occurring',),
        ])
        typeID = self.addTypes([
            ('disease',),
            ('gene',),
        ])
        subtypeID = self.addSubtypes([
            ('-',),
        ])

        # process gaad disease
        self.log("processing diseases ...")
        diseases = {}
        for line in self.zfile('diseases2.txt.gz'):
            # only lines starting with a disease ID (AID...) carry data
            if not line.startswith("AID"):
                continue
            words = line.split("\t")
            if len(words) < 2:
                continue  # malformed line without a name column
            # store disease name of each disease ID (AID)
            diseases[words[0]] = words[1].rstrip()
        #foreach line in diseaseFile
        self.log(" OK: %d disease\n" % (len(diseases),))

        # store diseases
        self.log("writing diseases to the database ...")
        listGroup = list(diseases)
        listAID = self.addTypedGroups(typeID['disease'], ((subtypeID['-'], group, diseases[group]) for group in listGroup))
        groupAID = dict(zip(listGroup, listAID))
        self.log(" OK\n")

        # store diseases names
        self.log("writing diseases names to the database ...")
        self.addGroupNamespacedNames(namespaceID['gaad_id'], ((groupAID[group], group) for group in listGroup))
        self.addGroupNamespacedNames(namespaceID['disease'], ((groupAID[group], diseases[group]) for group in listGroup))
        diseases = None
        self.log(" OK\n")

        # process gaad disease relationships
        self.log("processing diseases relationships ...")
        coOccurring = relationshipID['disease_co-occurring']
        relationships = []
        for line in self.zfile('disease_relationships.txt.gz'):
            if line.startswith("disease_uid1"):
                continue  # header line
            words = line.split("\t")
            if len(words) < 2:
                continue
            diseaseID = words[0].strip()
            diseaseID2 = words[1].strip()
            # Relate the stored groups, not the raw disease IDs: both ends
            # are mapped through groupAID like every other record written
            # here, and pairs naming an unknown disease are skipped.
            # NOTE(review): assumes addGroupRelationships expects group ids,
            # as the other add* calls in this class do -- confirm.
            if diseaseID not in groupAID or diseaseID2 not in groupAID:
                continue
            # store disease pairs that shares genes
            relationships.append((groupAID[diseaseID], groupAID[diseaseID2], coOccurring, None))
        #foreach line in relationshipFile
        self.log(" OK: %d disease relationships\n" % (len(relationships),))

        # store gaad disease relationships
        self.log("writing diseases relationships to the database ...")
        self.addGroupRelationships(relationships)
        relationships = None
        self.log(" OK\n")

        # process gaad disease identifiers
        self.log("processing diseases identifiers ...")
        # (file name, disease ID column, gene ID column) of each association file
        associationFiles = (
            ('disease_association_database_annotations_uniprot_ncbiGene.txt.gz', 0, 1),
            ('disease_association_genecards.txt.gz', 0, 1),
            ('disease_gene_association_pubmed_textmining_zhao.txt.gz', 2, 1),
        )
        diseaseGene = []
        for fileName, diseaseColumn, geneColumn in associationFiles:
            lastColumn = max(diseaseColumn, geneColumn)
            for line in self.zfile(fileName):
                if line.startswith("disease_"):
                    continue  # header line
                words = line.split("\t")
                if len(words) <= lastColumn:
                    continue  # malformed line with too few columns
                diseaseID = words[diseaseColumn].strip()
                # skip associations for diseases that were not stored above
                if diseaseID not in groupAID:
                    continue
                entrezID = words[geneColumn].strip()
                # the running position doubles as the member number of the pair
                diseaseGene.append((groupAID[diseaseID], len(diseaseGene) + 1, entrezID))
            #foreach line in file
        #foreach association file
        self.log(" OK: %d diseases and gene pairs\n" % (len(diseaseGene),))

        # store gaad disease identifiers
        self.log("writing diseases and gene pairs to the database ...")
        self.addGroupMemberTypedNamespacedNames(typeID['gene'], namespaceID['entrez_gid'], diseaseGene)
        diseaseGene = None
        self.log(" OK\n")

    #update()

#Source_gaad
Loading

0 comments on commit 92fb502

Please sign in to comment.