diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..349c8e3 --- /dev/null +++ b/.flake8 @@ -0,0 +1,9 @@ +[flake8] +max-line-length = 88 # Black standard +exclude = .git,__pycache__,build,dist # Ignore folders and files +max-complexity = 10 # Set the maximum complexity allowed +ignore = + ; E203, # space before ':' + ; W503, # line break before binary operator + ; W191, # indentation contains tabs + ; E101, # indentation contains mixed spaces and tabs \ No newline at end of file diff --git a/.github/workflows/format.yml b/.github/workflows/format.yml new file mode 100644 index 0000000..65280a5 --- /dev/null +++ b/.github/workflows/format.yml @@ -0,0 +1,31 @@ +name: Code Formatting + +on: + push: + branches: + - development + pull_request: + branches: + - master + - development + +jobs: + black: + runs-on: ubuntu-latest + + steps: + - name: Check out repository + uses: actions/checkout@v2 + + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + + - name: Install black + run: | + pip install black + + - name: Run black check + run: | + black --check . diff --git a/.gitignore b/.gitignore index cebe031..f964de6 100644 --- a/.gitignore +++ b/.gitignore @@ -5,3 +5,4 @@ venv build dist LOKI.egg-info +.vscode \ No newline at end of file diff --git a/CHANGELOG b/CHANGELOG index 4896bec..5e17202 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,6 +1,10 @@ CHANGELOG -3.0.0 +3.0.1-dev +- + + +3.0.0-dev (2024-10-03) - added disease category for groups - added sub-categories for groups - added data source like disgenet, gaad, and kegg disease diff --git a/__init__.py b/__init__.py index 8a42c50..ad9077a 100644 --- a/__init__.py +++ b/__init__.py @@ -1,2 +1,9 @@ -__all__ = ["loki-build","loki/loki_db","loki/loki_source","loki/loki_updater","loki/loaders","loki/util"] -__version__ = "3.0.0" \ No newline at end of file +__all__ = [ + "loki-build", + "loki/loki_db", + "loki/loki_source", + "loki/loki_updater", + "loki/loaders", + "loki/util", +] +__version__ = "3.0.0" diff --git a/loki-build.py b/loki-build.py index 4451b46..d5138a2 100644 --- a/loki-build.py +++ b/loki-build.py @@ -69,244 +69,337 @@ from loki import loki_db - if __name__ == "__main__": - version = "LOKI version %s" % (loki_db.Database.getVersionString()) - - # define arguments - parser = argparse.ArgumentParser( - formatter_class=argparse.RawDescriptionHelpFormatter, - description=version, - ) - parser.add_argument('--version', action='version', - version=version+"\n%s version %s\n%s version %s" % ( - loki_db.Database.getDatabaseDriverName(), loki_db.Database.getDatabaseDriverVersion(), - loki_db.Database.getDatabaseInterfaceName(), loki_db.Database.getDatabaseInterfaceVersion() - ) - ) - parser.add_argument('-k', '--knowledge', type=str, metavar='file', action='store', default=None, - help="the knowledge database file to use" - ) - parser.add_argument('-a', '--archive', type=str, metavar='file', action='store', default=None, - help="create (or re-use and update) a compressed archive of downloaded source data files" - ) - parser.add_argument('--from-archive', type=str, metavar='file', action='store', default=None, - help="an input source data archive to re-use but not update" - ) - parser.add_argument('--to-archive', type=str, metavar='file', action='store', default=None, - help="an output source data archive to create (or replace) but not re-use" - ) - parser.add_argument('-d', '--temp-directory', type=str, metavar='dir', action='store', default=None, - help="a directory to use for temporary 
storage of downloaded or archived source data files (default: platform dependent)" - ) -# parser.add_argument('-m', '--memory', type=str, metavar='size', default=None, #TODO -# help="the target amount of system memory to use (not exact, allow some margin); default: ~1gb" -# ) - parser.add_argument('-l', '--list-sources', type=str, metavar='source', nargs='*', action='append', default=None, - help="list versions and options for the specified source loaders, or if none or '+' are specified, list all available sources" - ) - parser.add_argument('-c', '--cache-only', action='store_true', - help="do not download any new source data files, only use what's available in the provided archive" - ) - parser.add_argument('-u', '--update', type=str, metavar='source', nargs='*', action='append', default=None, - help="update the knowledge database file by downloading and processing new data from the specified sources, " - +"or if none or '+' are specified, from all available sources" - ) - parser.add_argument('-U', '--update-except', type=str, metavar='source', nargs='*', action='append', default=None, - help="update the knowledge database file by downloading and processing new data from all available sources EXCEPT those specified" - ) - parser.add_argument('-o', '--option', type=str, metavar=('source','optionstring'), nargs=2, action='append', default=None, - help="additional option(s) to pass to the specified source loader module, in the format 'option=value[,option2=value2[,...]]'" - ) # e.g. --option dbsnp roles=yes - parser.add_argument('-r', '--force-update', action='store_true', - help="update all sources even if their source data has not changed since the last update" - ) - parser.add_argument('-f', '--finalize', action='store_true', - help="finalize the knowledge database file" - ) - parser.add_argument('--no-optimize', action='store_true', - help="do not optimize the knowledge database file after updating" - ) - parser.add_argument('-v', '--verbose', action='store_true', - help="print warnings and log messages (default)" - ) - parser.add_argument('-q', '--quiet', action='store_true', - help="suppress warnings and log messages" - ) - parser.add_argument('-t', '--test-data', action='store_true', - help="Load testing data only" - ) - - # if no arguments, print usage and exit - if len(sys.argv) < 2: - print (version) - print - parser.print_usage() - print - print ("Use -h for details.") - sys.exit(2) - - # parse arguments - args = parser.parse_args() - -# # parse memory allotment, if any -# memLimit = 64*1024*1024 # default 64mb for sqlite (+ ~1gb for updater itself) -# if args.memory: -# m = args.memory.upper() -# if m.endswith('B'): -# m = m[:-1] -# if m.endswith('T'): -# m = float(m[:-1]) * 1024 * 1024 * 1024 * 1024 -# elif m.endswith('G'): -# m = float(m[:-1]) * 1024 * 1024 * 1024 -# elif m.endswith('M'): -# m = float(m[:-1]) * 1024 * 1024 -# elif m.endswith('K'): -# m = float(m[:-1]) * 1024 -# else: -# m = float(m) -# m = long(m) -# if m < 1024*1024*1024: -# print "WARNING: ignoring '%s' memory allotment, the updater requires ~1gb at minimum" % args.memory -# else: -# print "using ~%1.1fMB of memory" % (m / (1024 * 1024)) -# memLimit = max(memLimit, m - 1024*1024*1024) -# #if args.memory - - # set $TMPDIR so sqlite will use it for vacuum etc. 
- if args.temp_directory: - if not os.path.isdir(args.temp_directory): - print ("ERROR: '%s' is not a directory") - sys.exit(1) - os.environ['TMPDIR'] = os.path.abspath(args.temp_directory) - - # instantiate database object - db = loki_db.Database(testing=args.test_data, updating=True) - db.setVerbose(args.verbose or (not args.quiet)) - db.attachDatabaseFile(args.knowledge) - - # list sources? - if args.list_sources != None: - srcSet = set() - for srcList in args.list_sources: - srcSet |= set(srcList) - if (not srcSet) or ('+' in srcSet): - print ("available source loaders:") - srcSet = set() - else: - print ("source loader options:") - moduleVersions = db.getSourceModuleVersions(srcSet) - moduleOptions = db.getSourceModuleOptions(srcSet) - for srcName in sorted(moduleOptions.keys()): - print (" %s : %s" % (srcName,moduleVersions[srcName])) - if moduleOptions[srcName]: - for srcOption in sorted(moduleOptions[srcName].keys()): - print (" %s = %s" % (srcOption,moduleOptions[srcName][srcOption])) - elif srcSet: - print (" ") - - # pass options? - userOptions = {} - if args.option != None: - for optList in args.option: - srcName = optList[0] - if srcName not in userOptions: - userOptions[srcName] = {} - for optString in optList[1].split(','): - opt,val = optString.split('=',1) - userOptions[srcName][opt] = val - userOptions = userOptions or None - - # parse requested update sources - srcSet = None - if args.update != None: - srcSet = set() - for srcList in args.update: - srcSet |= set(srcList) - notSet = None - if args.update_except != None: - notSet = set() - for srcList in args.update_except: - notSet |= set(srcList) - - # update? - updateOK = True - if (srcSet != None) or (notSet != None): - db.testDatabaseWriteable() - if db.getDatabaseSetting('finalized',int): - print ("ERROR: cannot update a finalized database") - sys.exit(1) - if srcSet and '+' in srcSet: - srcSet = set() - srcSet = (srcSet or set(db.getSourceModules())) - (notSet or set()) - - # create temp directory and unpack input archive, if any - startDir = os.getcwd() - fromArchive = args.from_archive or args.archive - toArchive = args.to_archive or args.archive - cacheDir = os.path.abspath(tempfile.mkdtemp(prefix='loki_update_cache.', dir=args.temp_directory)) - if args.temp_directory: - print ("using temporary directory '%s'" % cacheDir) - - # try/finally to make sure we clean up the cache dir at the end - try: - if fromArchive: - if os.path.exists(fromArchive) and tarfile.is_tarfile(fromArchive): - print ("unpacking archived source data files from '%s' ..." % fromArchive) - with tarfile.open(name=fromArchive, mode='r:*') as archive: - archive.errorlevel = 2 - # the archive should only contain directories named after sources, - # so we can filter members by their normalized top-level directory - for member in archive: - srcName = posixpath.normpath(member.name).split('/',1)[0] - if (not srcName) or srcName.startswith('.'): - continue - # if we're not writing an output archive, we only have to extract - # the directories for the sources we need - if (not toArchive) and (srcName not in srcSet): - continue - archive.extractall(cacheDir, [member]) - #with archive - print ("... 
OK") - else: - print ("source data archive '%s' not found, starting fresh" % fromArchive) - #if fromArchive - - os.chdir(cacheDir) - updateOK = db.updateDatabase(srcSet, userOptions, args.cache_only, args.force_update) - os.chdir(startDir) - - # create output archive, if requested - if toArchive and not args.cache_only: - print ("archiving source data files in '%s' ..." % toArchive) - with tarfile.open(name=toArchive, mode='w:gz') as archive: - archive.errorlevel = 2 - for filename in sorted(os.listdir(cacheDir)): - archive.add(os.path.join(cacheDir, filename), arcname=filename) - print ("... OK") - finally: - # clean up cache directory - def rmtree_error(func, path, exc): - print ("WARNING: unable to remove temporary file '%s': %s\n" % (path,exc)) - shutil.rmtree(cacheDir, onerror=rmtree_error) - #update - - if args.knowledge: - # finalize? - if args.finalize and (not db.getDatabaseSetting('finalized',int)): - if not updateOK: - print ("WARNING: errors encountered during knowledge database update; skipping finalization step") - else: - db.testDatabaseWriteable() - db.finalizeDatabase() - - # optimize? - if (not args.no_optimize) and (not db.getDatabaseSetting('optimized',int)): - if not updateOK: - print ("WARNING: errors encountered during knowledge database update; skipping optimization step") - else: - db.testDatabaseWriteable() - db.optimizeDatabase() - #if knowledge -#__main__ + version = "LOKI version %s" % (loki_db.Database.getVersionString()) + + # define arguments + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description=version, + ) + parser.add_argument( + "--version", + action="version", + version=version + + "\n%s version %s\n%s version %s" + % ( + loki_db.Database.getDatabaseDriverName(), + loki_db.Database.getDatabaseDriverVersion(), + loki_db.Database.getDatabaseInterfaceName(), + loki_db.Database.getDatabaseInterfaceVersion(), + ), + ) + parser.add_argument( + "-k", + "--knowledge", + type=str, + metavar="file", + action="store", + default=None, + help="the knowledge database file to use", + ) + parser.add_argument( + "-a", + "--archive", + type=str, + metavar="file", + action="store", + default=None, + help="create (or re-use and update) a compressed archive of downloaded source data files", + ) + parser.add_argument( + "--from-archive", + type=str, + metavar="file", + action="store", + default=None, + help="an input source data archive to re-use but not update", + ) + parser.add_argument( + "--to-archive", + type=str, + metavar="file", + action="store", + default=None, + help="an output source data archive to create (or replace) but not re-use", + ) + parser.add_argument( + "-d", + "--temp-directory", + type=str, + metavar="dir", + action="store", + default=None, + help="a directory to use for temporary storage of downloaded or archived source data files (default: platform dependent)", + ) + # parser.add_argument('-m', '--memory', type=str, metavar='size', default=None, #TODO + # help="the target amount of system memory to use (not exact, allow some margin); default: ~1gb" + # ) + parser.add_argument( + "-l", + "--list-sources", + type=str, + metavar="source", + nargs="*", + action="append", + default=None, + help="list versions and options for the specified source loaders, or if none or '+' are specified, list all available sources", + ) + parser.add_argument( + "-c", + "--cache-only", + action="store_true", + help="do not download any new source data files, only use what's available in the provided archive", + ) + 
parser.add_argument( + "-u", + "--update", + type=str, + metavar="source", + nargs="*", + action="append", + default=None, + help="update the knowledge database file by downloading and processing new data from the specified sources, " + + "or if none or '+' are specified, from all available sources", + ) + parser.add_argument( + "-U", + "--update-except", + type=str, + metavar="source", + nargs="*", + action="append", + default=None, + help="update the knowledge database file by downloading and processing new data from all available sources EXCEPT those specified", + ) + parser.add_argument( + "-o", + "--option", + type=str, + metavar=("source", "optionstring"), + nargs=2, + action="append", + default=None, + help="additional option(s) to pass to the specified source loader module, in the format 'option=value[,option2=value2[,...]]'", + ) # e.g. --option dbsnp roles=yes + parser.add_argument( + "-r", + "--force-update", + action="store_true", + help="update all sources even if their source data has not changed since the last update", + ) + parser.add_argument( + "-f", + "--finalize", + action="store_true", + help="finalize the knowledge database file", + ) + parser.add_argument( + "--no-optimize", + action="store_true", + help="do not optimize the knowledge database file after updating", + ) + parser.add_argument( + "-v", + "--verbose", + action="store_true", + help="print warnings and log messages (default)", + ) + parser.add_argument( + "-q", "--quiet", action="store_true", help="suppress warnings and log messages" + ) + parser.add_argument( + "-t", "--test-data", action="store_true", help="Load testing data only" + ) + + # if no arguments, print usage and exit + if len(sys.argv) < 2: + print(version) + print() + parser.print_usage() + print() + print("Use -h for details.") + sys.exit(2) + + # parse arguments + args = parser.parse_args() + + # # parse memory allotment, if any + # memLimit = 64*1024*1024 # default 64mb for sqlite (+ ~1gb for updater itself) + # if args.memory: + # m = args.memory.upper() + # if m.endswith('B'): + # m = m[:-1] + # if m.endswith('T'): + # m = float(m[:-1]) * 1024 * 1024 * 1024 * 1024 + # elif m.endswith('G'): + # m = float(m[:-1]) * 1024 * 1024 * 1024 + # elif m.endswith('M'): + # m = float(m[:-1]) * 1024 * 1024 + # elif m.endswith('K'): + # m = float(m[:-1]) * 1024 + # else: + # m = float(m) + # m = long(m) + # if m < 1024*1024*1024: + # print "WARNING: ignoring '%s' memory allotment, the updater requires ~1gb at minimum" % args.memory + # else: + # print "using ~%1.1fMB of memory" % (m / (1024 * 1024)) + # memLimit = max(memLimit, m - 1024*1024*1024) + # #if args.memory + + # set $TMPDIR so sqlite will use it for vacuum etc. + if args.temp_directory: + if not os.path.isdir(args.temp_directory): + print("ERROR: '%s' is not a directory") + sys.exit(1) + os.environ["TMPDIR"] = os.path.abspath(args.temp_directory) + + # instantiate database object + db = loki_db.Database(testing=args.test_data, updating=True) + db.setVerbose(args.verbose or (not args.quiet)) + db.attachDatabaseFile(args.knowledge) + + # list sources? 
+ if args.list_sources != None: + srcSet = set() + for srcList in args.list_sources: + srcSet |= set(srcList) + if (not srcSet) or ("+" in srcSet): + print("available source loaders:") + srcSet = set() + else: + print("source loader options:") + moduleVersions = db.getSourceModuleVersions(srcSet) + moduleOptions = db.getSourceModuleOptions(srcSet) + for srcName in sorted(moduleOptions.keys()): + print(" %s : %s" % (srcName, moduleVersions[srcName])) + if moduleOptions[srcName]: + for srcOption in sorted(moduleOptions[srcName].keys()): + print( + " %s = %s" % (srcOption, moduleOptions[srcName][srcOption]) + ) + elif srcSet: + print(" ") + + # pass options? + userOptions = {} + if args.option != None: + for optList in args.option: + srcName = optList[0] + if srcName not in userOptions: + userOptions[srcName] = {} + for optString in optList[1].split(","): + opt, val = optString.split("=", 1) + userOptions[srcName][opt] = val + userOptions = userOptions or None + + # parse requested update sources + srcSet = None + if args.update != None: + srcSet = set() + for srcList in args.update: + srcSet |= set(srcList) + notSet = None + if args.update_except != None: + notSet = set() + for srcList in args.update_except: + notSet |= set(srcList) + + # update? + updateOK = True + if (srcSet != None) or (notSet != None): + db.testDatabaseWriteable() + if db.getDatabaseSetting("finalized", int): + print("ERROR: cannot update a finalized database") + sys.exit(1) + if srcSet and "+" in srcSet: + srcSet = set() + srcSet = (srcSet or set(db.getSourceModules())) - (notSet or set()) + + # create temp directory and unpack input archive, if any + startDir = os.getcwd() + fromArchive = args.from_archive or args.archive + toArchive = args.to_archive or args.archive + cacheDir = os.path.abspath( + tempfile.mkdtemp(prefix="loki_update_cache.", dir=args.temp_directory) + ) + if args.temp_directory: + print("using temporary directory '%s'" % cacheDir) + + # try/finally to make sure we clean up the cache dir at the end + try: + if fromArchive: + if os.path.exists(fromArchive) and tarfile.is_tarfile(fromArchive): + print( + "unpacking archived source data files from '%s' ..." + % fromArchive + ) + with tarfile.open(name=fromArchive, mode="r:*") as archive: + archive.errorlevel = 2 + # the archive should only contain directories named after sources, + # so we can filter members by their normalized top-level directory + for member in archive: + srcName = posixpath.normpath(member.name).split("/", 1)[0] + if (not srcName) or srcName.startswith("."): + continue + # if we're not writing an output archive, we only have to extract + # the directories for the sources we need + if (not toArchive) and (srcName not in srcSet): + continue + archive.extractall(cacheDir, [member]) + # with archive + print("... OK") + else: + print( + "source data archive '%s' not found, starting fresh" + % fromArchive + ) + # if fromArchive + + os.chdir(cacheDir) + updateOK = db.updateDatabase( + srcSet, userOptions, args.cache_only, args.force_update + ) + os.chdir(startDir) + + # create output archive, if requested + if toArchive and not args.cache_only: + print("archiving source data files in '%s' ..." % toArchive) + with tarfile.open(name=toArchive, mode="w:gz") as archive: + archive.errorlevel = 2 + for filename in sorted(os.listdir(cacheDir)): + archive.add(os.path.join(cacheDir, filename), arcname=filename) + print("... 
OK") + finally: + # clean up cache directory + def rmtree_error(func, path, exc): + print( + "WARNING: unable to remove temporary file '%s': %s\n" % (path, exc) + ) + + shutil.rmtree(cacheDir, onerror=rmtree_error) + # update + + if args.knowledge: + # finalize? + if args.finalize and (not db.getDatabaseSetting("finalized", int)): + if not updateOK: + print( + "WARNING: errors encountered during knowledge database update; skipping finalization step" + ) + else: + db.testDatabaseWriteable() + db.finalizeDatabase() + # optimize? + if (not args.no_optimize) and (not db.getDatabaseSetting("optimized", int)): + if not updateOK: + print( + "WARNING: errors encountered during knowledge database update; skipping optimization step" + ) + else: + db.testDatabaseWriteable() + db.optimizeDatabase() + # if knowledge +# __main__ diff --git a/loki/__init__.py b/loki/__init__.py index 398ceac..01db6c4 100644 --- a/loki/__init__.py +++ b/loki/__init__.py @@ -1 +1 @@ -__all__ = ["loki_db","loki_source","loki_updater","loaders","util"] +__all__ = ["loki_db", "loki_source", "loki_updater", "loaders", "util"] diff --git a/loki/loaders/loki_source_biogrid.py b/loki/loaders/loki_source_biogrid.py index 93482bf..0cef60e 100644 --- a/loki/loaders/loki_source_biogrid.py +++ b/loki/loaders/loki_source_biogrid.py @@ -5,142 +5,171 @@ class Source_biogrid(loki_source.Source): - - - @classmethod - def getVersionString(cls): - return '2.1 (2022-04-13)' - #getVersionString() - - - def download(self, options, path): - # download the latest source files - self.downloadFilesFromHTTP('downloads.thebiogrid.org', { - path+'/BIOGRID-ORGANISM-LATEST.tab2.zip': '/Download/BioGRID/Latest-Release/BIOGRID-ORGANISM-LATEST.tab2.zip', - }) - - return [ path+'/BIOGRID-ORGANISM-LATEST.tab2.zip'] - #download() - - - def update(self, options, path): - # clear out all old data from this source - self.log("deleting old records from the database ...\n") - self.deleteAll() - self.log("deleting old records from the database completed\n") - - # get or create the required metadata records - namespaceID = self.addNamespaces([ - ('biogrid_id', 0), - ('symbol', 0), - ('entrez_gid', 0), - ]) - typeID = self.addTypes([ - ('interaction',), - ('gene',), - ]) - subtypeID = self.addSubtypes([ - ('-',), - ]) - - # process associations - self.log("verifying archive file ...\n") - pairLabels = dict() - empty = tuple() - with zipfile.ZipFile(path+'/BIOGRID-ORGANISM-LATEST.tab2.zip','r') as assocZip: - err = assocZip.testzip() - if err: - self.log(" ERROR\n") - self.log("CRC failed for %s\n" % err) - return False - self.log("verifying archive file completed\n") - self.log("processing gene interactions ...\n") - for info in assocZip.infolist(): - if info.filename.find('Homo_sapiens') >= 0: - assocFile = assocZip.open(info,'r') - header = assocFile.__next__().rstrip() - observedHeaders = { - "#BioGRID Interaction ID\tEntrez Gene Interactor A\tEntrez Gene Interactor B\tBioGRID ID Interactor A\tBioGRID ID Interactor B\tSystematic Name Interactor A\tSystematic Name Interactor B\tOfficial Symbol Interactor A\tOfficial Symbol Interactor B\tSynonymns Interactor A\tSynonyms Interactor B\tExperimental System\tExperimental System Type\tAuthor\tPubmed ID\tOrganism Interactor A\tOrganism Interactor B", # "\tThroughput\tScore\tModification\tPhenotypes\tQualifications\tTags\tSource Database", - "#BioGRID Interaction ID\tEntrez Gene Interactor A\tEntrez Gene Interactor B\tBioGRID ID Interactor A\tBioGRID ID Interactor B\tSystematic Name Interactor A\tSystematic Name Interactor 
B\tOfficial Symbol Interactor A\tOfficial Symbol Interactor B\tSynonyms Interactor A\tSynonyms Interactor B\tExperimental System\tExperimental System Type\tAuthor\tPubmed ID\tOrganism Interactor A\tOrganism Interactor B", # "\tThroughput\tScore\tModification\tPhenotypes\tQualifications\tTags\tSource Database", - } - if not max(header.decode().startswith(obsHdr) for obsHdr in observedHeaders): - self.log(" ERROR\n") - self.log("unrecognized file header in '%s': %s\n" % (info.filename,header)) - return False - for line in assocFile: - line = line.decode() - words = line.split('\t') - if(words[1] == "-" or words[2] == "-"): - continue - bgID = int(words[0]) - entrezID1 = int(words[1]) - entrezID2 = int(words[2]) - syst1 = words[5] if words[5] != "-" else None - syst2 = words[6] if words[6] != "-" else None - gene1 = words[7] - gene2 = words[8] - aliases1 = words[9].split('|') if words[9] != "-" else empty - aliases2 = words[10].split('|') if words[10] != "-" else empty - tax1 = words[15] - tax2 = words[16] - - if tax1 == '9606' and tax2 == '9606': - member1 = (entrezID1, gene1, syst1) + tuple(aliases1) - member2 = (entrezID2, gene2, syst2) + tuple(aliases2) - if member1 != member2: - pair = (member1,member2) - if pair not in pairLabels: - pairLabels[pair] = set() - pairLabels[pair].add(bgID) - #if interaction is ok - #foreach line in assocFile - assocFile.close() - #if Homo_sapiens file - #foreach file in assocZip - #with assocZip - numAssoc = len(pairLabels) - numGene = len(set(pair[0] for pair in pairLabels) | set(pair[1] for pair in pairLabels)) - numName = sum(len(pairLabels[pair]) for pair in pairLabels) - self.log("processing gene interactions completed: %d interactions (%d genes), %d pair identifiers\n" % (numAssoc,numGene,numName)) - - # store interaction groups - self.log("writing interaction pairs to the database ...\n") - listPair = pairLabels.keys() - listGID = self.addTypedGroups(typeID['interaction'], ((subtypeID['-'], "biogrid:%s" % min(pairLabels[pair]), "") for pair in listPair)) - pairGID = dict(zip(listPair,listGID)) - self.log("writing interaction pairs to the database completed\n") - - # store interaction labels - listLabels = [] - for pair in listPair: - listLabels.extend( (pairGID[pair],label) for label in pairLabels[pair] ) - self.log("writing interaction names to the database ...\n") - self.addGroupNamespacedNames(namespaceID['biogrid_id'], listLabels) - self.log("writing interaction names to the database completed\n") - - # store gene interactions - self.log("writing gene interactions to the database ...\n") - nsAssoc = { - 'symbol': set(), - 'entrez_gid': set(), - } - numAssoc = 0 - for pair in pairLabels: - numAssoc += 1 - nsAssoc['entrez_gid'].add( (pairGID[pair],numAssoc,pair[0][0]) ) - for n in range(1,len(pair[0])): - nsAssoc['symbol'].add( (pairGID[pair],numAssoc,pair[0][n]) ) - - numAssoc += 1 - nsAssoc['entrez_gid'].add( (pairGID[pair],numAssoc,pair[1][0]) ) - for n in range(1,len(pair[1])): - nsAssoc['symbol'].add( (pairGID[pair],numAssoc,pair[1][n]) ) - for ns in nsAssoc: - self.addGroupMemberTypedNamespacedNames(typeID['gene'], namespaceID[ns], nsAssoc[ns]) - self.log("writing gene interactions to the database completed\n") - - # TODO: decide if there's any value in trying to identify pseudo-pathways - """ + + @classmethod + def getVersionString(cls): + return "2.1 (2022-04-13)" + + # getVersionString() + + def download(self, options, path): + # download the latest source files + self.downloadFilesFromHTTP( + "downloads.thebiogrid.org", + { + path + + 
"/BIOGRID-ORGANISM-LATEST.tab2.zip": "/Download/BioGRID/Latest-Release/BIOGRID-ORGANISM-LATEST.tab2.zip", + }, + ) + + return [path + "/BIOGRID-ORGANISM-LATEST.tab2.zip"] + + # download() + + def update(self, options, path): + # clear out all old data from this source + self.log("deleting old records from the database ...\n") + self.deleteAll() + self.log("deleting old records from the database completed\n") + + # get or create the required metadata records + namespaceID = self.addNamespaces( + [ + ("biogrid_id", 0), + ("symbol", 0), + ("entrez_gid", 0), + ] + ) + typeID = self.addTypes( + [ + ("interaction",), + ("gene",), + ] + ) + subtypeID = self.addSubtypes( + [ + ("-",), + ] + ) + + # process associations + self.log("verifying archive file ...\n") + pairLabels = dict() + empty = tuple() + with zipfile.ZipFile( + path + "/BIOGRID-ORGANISM-LATEST.tab2.zip", "r" + ) as assocZip: + err = assocZip.testzip() + if err: + self.log(" ERROR\n") + self.log("CRC failed for %s\n" % err) + return False + self.log("verifying archive file completed\n") + self.log("processing gene interactions ...\n") + for info in assocZip.infolist(): + if info.filename.find("Homo_sapiens") >= 0: + assocFile = assocZip.open(info, "r") + header = assocFile.__next__().rstrip() + observedHeaders = { + "#BioGRID Interaction ID\tEntrez Gene Interactor A\tEntrez Gene Interactor B\tBioGRID ID Interactor A\tBioGRID ID Interactor B\tSystematic Name Interactor A\tSystematic Name Interactor B\tOfficial Symbol Interactor A\tOfficial Symbol Interactor B\tSynonymns Interactor A\tSynonyms Interactor B\tExperimental System\tExperimental System Type\tAuthor\tPubmed ID\tOrganism Interactor A\tOrganism Interactor B", # "\tThroughput\tScore\tModification\tPhenotypes\tQualifications\tTags\tSource Database", + "#BioGRID Interaction ID\tEntrez Gene Interactor A\tEntrez Gene Interactor B\tBioGRID ID Interactor A\tBioGRID ID Interactor B\tSystematic Name Interactor A\tSystematic Name Interactor B\tOfficial Symbol Interactor A\tOfficial Symbol Interactor B\tSynonyms Interactor A\tSynonyms Interactor B\tExperimental System\tExperimental System Type\tAuthor\tPubmed ID\tOrganism Interactor A\tOrganism Interactor B", # "\tThroughput\tScore\tModification\tPhenotypes\tQualifications\tTags\tSource Database", + } + if not max( + header.decode().startswith(obsHdr) for obsHdr in observedHeaders + ): + self.log(" ERROR\n") + self.log( + "unrecognized file header in '%s': %s\n" + % (info.filename, header) + ) + return False + for line in assocFile: + line = line.decode() + words = line.split("\t") + if words[1] == "-" or words[2] == "-": + continue + bgID = int(words[0]) + entrezID1 = int(words[1]) + entrezID2 = int(words[2]) + syst1 = words[5] if words[5] != "-" else None + syst2 = words[6] if words[6] != "-" else None + gene1 = words[7] + gene2 = words[8] + aliases1 = words[9].split("|") if words[9] != "-" else empty + aliases2 = words[10].split("|") if words[10] != "-" else empty + tax1 = words[15] + tax2 = words[16] + + if tax1 == "9606" and tax2 == "9606": + member1 = (entrezID1, gene1, syst1) + tuple(aliases1) + member2 = (entrezID2, gene2, syst2) + tuple(aliases2) + if member1 != member2: + pair = (member1, member2) + if pair not in pairLabels: + pairLabels[pair] = set() + pairLabels[pair].add(bgID) + # if interaction is ok + # foreach line in assocFile + assocFile.close() + # if Homo_sapiens file + # foreach file in assocZip + # with assocZip + numAssoc = len(pairLabels) + numGene = len( + set(pair[0] for pair in pairLabels) | set(pair[1] for 
pair in pairLabels) + ) + numName = sum(len(pairLabels[pair]) for pair in pairLabels) + self.log( + "processing gene interactions completed: %d interactions (%d genes), %d pair identifiers\n" + % (numAssoc, numGene, numName) + ) + + # store interaction groups + self.log("writing interaction pairs to the database ...\n") + listPair = pairLabels.keys() + listGID = self.addTypedGroups( + typeID["interaction"], + ( + (subtypeID["-"], "biogrid:%s" % min(pairLabels[pair]), "") + for pair in listPair + ), + ) + pairGID = dict(zip(listPair, listGID)) + self.log("writing interaction pairs to the database completed\n") + + # store interaction labels + listLabels = [] + for pair in listPair: + listLabels.extend((pairGID[pair], label) for label in pairLabels[pair]) + self.log("writing interaction names to the database ...\n") + self.addGroupNamespacedNames(namespaceID["biogrid_id"], listLabels) + self.log("writing interaction names to the database completed\n") + + # store gene interactions + self.log("writing gene interactions to the database ...\n") + nsAssoc = { + "symbol": set(), + "entrez_gid": set(), + } + numAssoc = 0 + for pair in pairLabels: + numAssoc += 1 + nsAssoc["entrez_gid"].add((pairGID[pair], numAssoc, pair[0][0])) + for n in range(1, len(pair[0])): + nsAssoc["symbol"].add((pairGID[pair], numAssoc, pair[0][n])) + + numAssoc += 1 + nsAssoc["entrez_gid"].add((pairGID[pair], numAssoc, pair[1][0])) + for n in range(1, len(pair[1])): + nsAssoc["symbol"].add((pairGID[pair], numAssoc, pair[1][n])) + for ns in nsAssoc: + self.addGroupMemberTypedNamespacedNames( + typeID["gene"], namespaceID[ns], nsAssoc[ns] + ) + self.log("writing gene interactions to the database completed\n") + + # TODO: decide if there's any value in trying to identify pseudo-pathways + """ self.log("identifying implied networks ...") geneAssoc = dict() for pair in listPair: @@ -156,7 +185,8 @@ def update(self, options, path): numGroup = len(listPath) self.log(" OK: %d associations (%d genes in %d groups)\n" % (numAssoc,numGene,numGroup)) """ - #update() - - -#Source_biogrid + + # update() + + +# Source_biogrid diff --git a/loki/loaders/loki_source_chainfiles.py b/loki/loaders/loki_source_chainfiles.py index bc0760c..fe49efe 100644 --- a/loki/loaders/loki_source_chainfiles.py +++ b/loki/loaders/loki_source_chainfiles.py @@ -9,180 +9,198 @@ class Source_chainfiles(loki_source.Source): - """ - A loader that loads all of the chainfiles into LOKI - """ - - - ################################################## - # private class data - - -# _reDir = re.compile('^hg[0-9]+$', re.IGNORECASE) - _reFile = re.compile(r'^hg([0-9]+)tohg([0-9]+)\.over\.chain\.gz$', re.IGNORECASE) - _reFileName = r'hg([0-9]+)ToHg([0-9]+)\.over\.chain\.gz' - - _reNum = ('4', '10', '11', '12', '13', '15', '16', '17', '18', '19', '38' ) - - ################################################## - # source interface - - - @classmethod - def getVersionString(cls): - return '2.2 (2014-06-27)' - #getVersionString() - - - def download(self, options, path): - # define a callback to search for all available hgX liftover chain files -# def remFilesCallback(ftp): -# remFiles = {} -# ftp.cwd('/goldenPath') -# for d in [ d for d in ftp.nlst() if self._reDir.match(d) ]: -# ftp.cwd('/goldenPath/%s' % d) -# if 'liftOver' in ftp.nlst(): -# ftp.cwd('/goldenPath/%s/liftOver' % d) -# for f in [ f for f in ftp.nlst() if self._reFile.match(f) ]: -# remFiles[f] = '/goldenPath/%s/liftOver/%s' % (d,f) -# return remFiles - #remFilesCallback - - remFiles = {} - for i in self._reNum: - 
urlpath = urllib2.urlopen('http://hgdownload.cse.ucsc.edu/goldenPath/hg%s/liftOver' % i) - string = urlpath.read().decode('utf-8') - onlyfiles = list(set(re.findall(self._reFileName, string))) - for j in onlyfiles: - if i == j[0]: - filenames = 'hg'+j[0]+'ToHg'+j[1]+'.over.chain.gz' - remFiles[path+'/'+filenames] = '/goldenPath/hg'+i+'/liftOver/'+filenames -# self.downloadFilesFromFTP("hgdownload.cse.ucsc.edu", remFilesCallback) - self.downloadFilesFromHTTP('hgdownload.cse.ucsc.edu', remFiles) - - return list(remFiles.keys()) - #download() - - - def update(self, options, path): - """ - Parse all of the chain files and insert them into the database - """ - - # clear out all old data from this source - self.log("deleting old records from the database ...\n") - self.deleteAll() - self.log("deleting old records from the database completed\n") - - for fn in os.listdir(path): - match = self._reFile.match(fn) - if not match: - continue - old_ucschg = int(match.group(1)) - new_ucschg = int(match.group(2)) - self.log("parsing chains for hg%d -> hg%d ...\n" % (old_ucschg,new_ucschg)) - f = self.zfile(path+'/'+fn) - - is_hdr = True - is_valid = True - chain_hdrs = [] - chain_data = [] - curr_data = [] - for line in f: - if is_hdr: - if line: - try: - chain_hdrs.append(self._parseChain(line)) - except: - is_valid = False - is_hdr = False - elif line: - if is_valid: - curr_data.append(line) - else: - if is_valid: - chain_data.append(self._parseData(chain_hdrs[-1], '\n'.join(curr_data))) - is_valid = True - curr_data = [] - is_hdr = True - - hdr_ids = self.addChains(old_ucschg, new_ucschg, chain_hdrs) - - # Now, I want to take my list of IDs and my list of list of - # tuples and convert them into a list of tuples suitable for - # entering in the chain_data table - chain_id_data = zip(hdr_ids, chain_data) - chain_data_itr = (tuple(itertools.chain((chn[0],),seg)) for chn in chain_id_data for seg in chn[1]) - - self.addChainData(chain_data_itr) - - self.log("parsing chains completed\n") - # for fn in dir - - #update() - - def _parseChain(self, chain_hdr): - """ - Parses the chain header to extract the information required - for insertion into the database. - UCSC chain files use 0-based half-open intervals according to: - https://genome.ucsc.edu/goldenPath/help/chain.html - Since LOKI uses 1-based closed intervals, we add 1 to start positions. 
- """ - - # get the 1st line - hdr = chain_hdr.strip().split('\n')[0].strip() - - # Parse the first line - # "chain" score oldChr oldSize oldDir oldStart oldEnd newChr newSize newDir newStart newEnd id - wds = hdr.split() - - if wds[0] != "chain": - raise Exception("Not a valid chain file") - - if wds[2][3:] not in self._loki.chr_num: - raise Exception("Could not find chromosome: " + wds[2][3:] + "->" + wds[7][3:]) - - is_fwd = (wds[9] == "+") - if is_fwd: - new_start = int(wds[10]) + 1 - new_end = int(wds[11]) - else: - # NOTE: If we're going backward, this will mean that - # end < start - new_start = int(wds[8]) - int(wds[10]) - new_end = int(wds[8]) - int(wds[11]) + 1 - - - # I want a tuple of (score, old_chr, old_start, old_end, - # new_chr, new_start, new_end, is_forward) - return (int(wds[1]), - self._loki.chr_num[wds[2][3:]], int(wds[5]) + 1, int(wds[6]), - self._loki.chr_num.get(wds[7][3:],-1), new_start, new_end, - int(is_fwd)) - - def _parseData(self, chain_tuple, chain_data): - """ - Parses the chain data into a more readily usable and iterable - form (the data of the chain is everything after the 1st line) - """ - _data = [ tuple([int(v) for v in l.split()]) for l in chain_data.split('\n')[:-1] ] - - curr_pos = chain_tuple[2] - new_pos = chain_tuple[5] - - _data_txform = [] - for l in _data: - _data_txform.append((curr_pos, curr_pos + l[0] - 1, new_pos)) - curr_pos = curr_pos + l[0] + l[1] - if chain_tuple[7]: - new_pos = new_pos + l[0] + l[2] - else: - new_pos = new_pos - l[0] - l[2] - - _data_txform.append((curr_pos, curr_pos + int(chain_data.split()[-1]) - 1, new_pos)) - - return _data_txform - -#class Source_chainfiles - + """ + A loader that loads all of the chainfiles into LOKI + """ + + ################################################## + # private class data + + # _reDir = re.compile('^hg[0-9]+$', re.IGNORECASE) + _reFile = re.compile(r"^hg([0-9]+)tohg([0-9]+)\.over\.chain\.gz$", re.IGNORECASE) + _reFileName = r"hg([0-9]+)ToHg([0-9]+)\.over\.chain\.gz" + + _reNum = ("4", "10", "11", "12", "13", "15", "16", "17", "18", "19", "38") + + ################################################## + # source interface + + @classmethod + def getVersionString(cls): + return "2.2 (2014-06-27)" + + # getVersionString() + + def download(self, options, path): + # define a callback to search for all available hgX liftover chain files + # def remFilesCallback(ftp): + # remFiles = {} + # ftp.cwd('/goldenPath') + # for d in [ d for d in ftp.nlst() if self._reDir.match(d) ]: + # ftp.cwd('/goldenPath/%s' % d) + # if 'liftOver' in ftp.nlst(): + # ftp.cwd('/goldenPath/%s/liftOver' % d) + # for f in [ f for f in ftp.nlst() if self._reFile.match(f) ]: + # remFiles[f] = '/goldenPath/%s/liftOver/%s' % (d,f) + # return remFiles + # remFilesCallback + + remFiles = {} + for i in self._reNum: + urlpath = urllib2.urlopen( + "http://hgdownload.cse.ucsc.edu/goldenPath/hg%s/liftOver" % i + ) + string = urlpath.read().decode("utf-8") + onlyfiles = list(set(re.findall(self._reFileName, string))) + for j in onlyfiles: + if i == j[0]: + filenames = "hg" + j[0] + "ToHg" + j[1] + ".over.chain.gz" + remFiles[path + "/" + filenames] = ( + "/goldenPath/hg" + i + "/liftOver/" + filenames + ) + # self.downloadFilesFromFTP("hgdownload.cse.ucsc.edu", remFilesCallback) + self.downloadFilesFromHTTP("hgdownload.cse.ucsc.edu", remFiles) + + return list(remFiles.keys()) + + # download() + + def update(self, options, path): + """ + Parse all of the chain files and insert them into the database + """ + + # clear out all old data 
from this source + self.log("deleting old records from the database ...\n") + self.deleteAll() + self.log("deleting old records from the database completed\n") + + for fn in os.listdir(path): + match = self._reFile.match(fn) + if not match: + continue + old_ucschg = int(match.group(1)) + new_ucschg = int(match.group(2)) + self.log("parsing chains for hg%d -> hg%d ...\n" % (old_ucschg, new_ucschg)) + f = self.zfile(path + "/" + fn) + + is_hdr = True + is_valid = True + chain_hdrs = [] + chain_data = [] + curr_data = [] + for line in f: + if is_hdr: + if line: + try: + chain_hdrs.append(self._parseChain(line)) + except: + is_valid = False + is_hdr = False + elif line: + if is_valid: + curr_data.append(line) + else: + if is_valid: + chain_data.append( + self._parseData(chain_hdrs[-1], "\n".join(curr_data)) + ) + is_valid = True + curr_data = [] + is_hdr = True + + hdr_ids = self.addChains(old_ucschg, new_ucschg, chain_hdrs) + + # Now, I want to take my list of IDs and my list of list of + # tuples and convert them into a list of tuples suitable for + # entering in the chain_data table + chain_id_data = zip(hdr_ids, chain_data) + chain_data_itr = ( + tuple(itertools.chain((chn[0],), seg)) + for chn in chain_id_data + for seg in chn[1] + ) + + self.addChainData(chain_data_itr) + + self.log("parsing chains completed\n") + # for fn in dir + + # update() + + def _parseChain(self, chain_hdr): + """ + Parses the chain header to extract the information required + for insertion into the database. + UCSC chain files use 0-based half-open intervals according to: + https://genome.ucsc.edu/goldenPath/help/chain.html + Since LOKI uses 1-based closed intervals, we add 1 to start positions. + """ + + # get the 1st line + hdr = chain_hdr.strip().split("\n")[0].strip() + + # Parse the first line + # "chain" score oldChr oldSize oldDir oldStart oldEnd newChr newSize newDir newStart newEnd id + wds = hdr.split() + + if wds[0] != "chain": + raise Exception("Not a valid chain file") + + if wds[2][3:] not in self._loki.chr_num: + raise Exception( + "Could not find chromosome: " + wds[2][3:] + "->" + wds[7][3:] + ) + + is_fwd = wds[9] == "+" + if is_fwd: + new_start = int(wds[10]) + 1 + new_end = int(wds[11]) + else: + # NOTE: If we're going backward, this will mean that + # end < start + new_start = int(wds[8]) - int(wds[10]) + new_end = int(wds[8]) - int(wds[11]) + 1 + + # I want a tuple of (score, old_chr, old_start, old_end, + # new_chr, new_start, new_end, is_forward) + return ( + int(wds[1]), + self._loki.chr_num[wds[2][3:]], + int(wds[5]) + 1, + int(wds[6]), + self._loki.chr_num.get(wds[7][3:], -1), + new_start, + new_end, + int(is_fwd), + ) + + def _parseData(self, chain_tuple, chain_data): + """ + Parses the chain data into a more readily usable and iterable + form (the data of the chain is everything after the 1st line) + """ + _data = [ + tuple([int(v) for v in l.split()]) for l in chain_data.split("\n")[:-1] + ] + + curr_pos = chain_tuple[2] + new_pos = chain_tuple[5] + + _data_txform = [] + for l in _data: + _data_txform.append((curr_pos, curr_pos + l[0] - 1, new_pos)) + curr_pos = curr_pos + l[0] + l[1] + if chain_tuple[7]: + new_pos = new_pos + l[0] + l[2] + else: + new_pos = new_pos - l[0] - l[2] + + _data_txform.append( + (curr_pos, curr_pos + int(chain_data.split()[-1]) - 1, new_pos) + ) + + return _data_txform + + +# class Source_chainfiles diff --git a/loki/loaders/loki_source_dbsnp.py b/loki/loaders/loki_source_dbsnp.py index 44db06b..947afbf 100644 --- a/loki/loaders/loki_source_dbsnp.py +++ 
b/loki/loaders/loki_source_dbsnp.py @@ -8,212 +8,262 @@ class Source_dbsnp(loki_source.Source): - - - ################################################## - # private class data - - - _chmList = ('1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','X','Y','PAR','MT') - _grcBuild = None - - ################################################## - # private class data - - - def _identifyLatestSNPContig(self, filenames): -# reFile = re.compile(r'^b([0-9]+)_SNPContigLocusId(.*)\.bcp\.gz$', re.IGNORECASE) - bestbuild = 0 - bestfile = list() - for filename in filenames: - #foreach file in path - if int(filename[0]) > bestbuild: - bestfile.append(filename[0]) - bestfile.append(filename[1].split(".bcp.gz")[0]) - bestbuild = int(filename[0]) - return bestfile - #_identifyLatestSNPContig() - - - ################################################## - # source interface - - - @classmethod - def getVersionString(cls): - return '2.3 (2018-11-01)' - #getVersionString() - - - @classmethod - def getOptions(cls): - return { - 'unvalidated' : '[yes|no] -- store SNP loci which have not been validated (default: yes)', - 'suspect' : '[yes|no] -- store SNP loci which are suspect (default: no)', # http://www.ncbi.nlm.nih.gov/projects/SNP/docs/rs_attributes.html#suspect - 'withdrawn' : '[yes|no] -- store SNP loci which have been withdrawn (default: no)', - 'loci' : '[all|validated] -- store all or only validated SNP loci (default: validat`dddded)', - 'merges' : '[yes|no] -- process and store RS# merge history (default: yes)', - 'roles' : '[yes|no] -- process and store SNP roles (default: no)', - } - #getOptions() - - - def validateOptions(self, options): - options.setdefault('unvalidated', 'yes') - options.setdefault('suspect', 'no') - options.setdefault('withdrawn', 'no') - options.setdefault('merges', 'yes') - options.setdefault('roles', 'no') - for o,v in options.items(): - v = v.strip().lower() - if o in ('unvalidated','suspect','withdrawn','merges','roles'): - if 'yes'.startswith(v): - v = 'yes' - elif 'no'.startswith(v): - v = 'no' - else: - return "%s must be 'yes' or 'no'" % o - else: - return "unknown option '%s'" % o - options[o] = v - return True - #validateOptions() - - - def download(self, options, path): - # define a callback to identify the latest SNPContigLocusId file - def remFilesCallback(ftp, path): - remFiles = dict() - for chm in self._chmList: - remFiles[path+'/chr_'+chm+'.txt.gz'] = '/snp/organisms/human_9606/chr_rpts/chr_%s.txt.gz' % chm - - if options['merges'] == 'yes': - remFiles[path+'/RsMergeArch.bcp.gz'] = '/snp/organisms/human_9606/database/organism_data/RsMergeArch.bcp.gz' - - if options.get['roles'] == 'yes': - remFiles[path+'/SnpFunctionCode.bcp.gz'] = '/snp/organisms/database/shared_data/SnpFunctionCode.bcp.gz' - urlpath = '/snp/organisms/human_9606/database/organism_data' - ftp.cwd(urlpath) - bestfile = self._identifyLatestSNPContig(ftp.nlst()) - - if bestfile: - remFiles[bestfile] = '%s/%s' % (urlpath,bestfile) - - return remFiles - #remFilesCallback - - remFiles = dict() - for chm in self._chmList: - remFiles[path+'/chr_%s.txt.gz' % chm] = '/snp/organisms/human_9606/chr_rpts/chr_%s.txt.gz' % chm - if options['merges'] == 'yes': - remFiles[path+'/RsMergeArch.bcp.gz'] = '/snp/organisms/human_9606/database/organism_data/RsMergeArch.bcp.gz' - if options['roles'] == 'yes': - remFiles[path+'/SnpFunctionCode.bcp.gz'] = '/snp/organisms/database/shared_data/SnpFunctionCode.bcp.gz' - urlfolderpath = 
'/snp/organisms/human_9606/database/organism_data' - urlpath = urllib2.urlopen('https://ftp.ncbi.nih.gov' + urlfolderpath) - string = urlpath.read().decode('utf-8') - onlyfiles = list(set(re.findall(r'b([0-9]+)_SNPContigLocusId_(.*)\.bcp\.gz', string))) - bestfile = self._identifyLatestSNPContig(onlyfiles) - bestfilename = 'b'+bestfile[0]+'_SNPContigLocusId_'+bestfile[1]+'.bcp.gz' - if bestfile: - remFiles[path+'/'+bestfilename] = '%s/%s' % (urlfolderpath,bestfilename) - - # download the latest source files -# self.downloadFilesFromFTP('ftp.ncbi.nih.gov', remFilesCallback) - self.downloadFilesFromHTTP('ftp.ncbi.nih.gov', remFiles) - - return list(remFiles.keys()) - #download() - - - def update(self, options, path): - # clear out all old data from this source - self.log("deleting old records from the database ...\n") - self.deleteAll() - self.log("deleting old records from the database completed\n") - - # process merge report (no header!) - if options.get('merges','yes') == 'yes': - """ /* from human_9606_table.sql.gz */ -CREATE TABLE [RsMergeArch] -( -[rsHigh] [int] NULL , -[rsLow] [int] NULL , -[build_id] [int] NULL , -[orien] [tinyint] NULL , -[create_time] [datetime] NOT NULL , -[last_updated_time] [datetime] NOT NULL , -[rsCurrent] [int] NULL , -[orien2Current] [tinyint] NULL , -[comment] [varchar](255) NULL -) -""" - self.log("processing SNP merge records ...\n") - mergeFile = self.zfile(path+'/RsMergeArch.bcp.gz') #TODO:context manager,iterator - numMerge = 0 - setMerge = set() - for line in mergeFile: - words = line.split("\t") - if not (len(words) > 6 and words[0] and words[6]): - continue - rsOld = int(words[0]) - #rsNew = int(words[1]) - rsCur = int(words[6]) - - setMerge.add( (rsOld,rsCur) ) - - # write to the database after each 2.5 million, to keep memory usage down - if len(setMerge) >= 2500000: - numMerge += len(setMerge) - self.log("processing SNP merge records: ~%1.1f million so far\n" % (numMerge/1000000.0)) #TODO: time estimate - self.log("writing SNP merge records to the database ...\n") - self.addSNPMerges(setMerge) - setMerge = set() - self.log("writing SNP merge records to the database completed\n") - #foreach line in mergeFile - numMerge += len(setMerge) - self.log("processing SNP merge records completed: ~%d merged RS#s\n" % numMerge) - if setMerge: - self.log("writing SNP merge records to the database ...\n") - self.addSNPMerges(setMerge) - self.log("writing SNP merge records to the database completed\n") - setMerge = None - #if merges - - # process SNP role function codes - if options.get('roles','no') == 'yes': - """ /* from dbSNP_main_table.sql.gz */ -CREATE TABLE [SnpFunctionCode] -( -[code] [tinyint] NOT NULL , -[abbrev] [varchar](20) NOT NULL , -[descrip] [varchar](255) NOT NULL , -[create_time] [smalldatetime] NOT NULL , -[top_level_class] [char](5) NOT NULL , -[is_coding] [tinyint] NOT NULL , -[is_exon] [bit] NULL , -[var_prop_effect_code] [int] NULL , -[var_prop_gene_loc_code] [int] NULL , -[SO_id] [varchar](32) NULL -) -""" - self.log("processing SNP role codes ...\n") - roleID = dict() - codeFile = self.zfile(path+'/SnpFunctionCode.bcp.gz') - for line in codeFile: - words = line.split('\t') - code = int(words[0]) - name = words[1] - desc = words[2] - coding = int(words[5]) if (len(words) > 5 and words[5] != '') else None - exon = int(words[6]) if (len(words) > 6 and words[6] != '') else None - - roleID[code] = self.addRole(name, desc, coding, exon) - #foreach line in codeFile - self.log("processing SNP role codes completed: %d codes\n" % len(roleID)) - 
- # process SNP roles - """ /* from human_9606_table.sql.gz */ + + ################################################## + # private class data + + _chmList = ( + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "21", + "22", + "X", + "Y", + "PAR", + "MT", + ) + _grcBuild = None + + ################################################## + # private class data + + def _identifyLatestSNPContig(self, filenames): + # reFile = re.compile(r'^b([0-9]+)_SNPContigLocusId(.*)\.bcp\.gz$', re.IGNORECASE) + bestbuild = 0 + bestfile = list() + for filename in filenames: + # foreach file in path + if int(filename[0]) > bestbuild: + bestfile.append(filename[0]) + bestfile.append(filename[1].split(".bcp.gz")[0]) + bestbuild = int(filename[0]) + return bestfile + + # _identifyLatestSNPContig() + + ################################################## + # source interface + + @classmethod + def getVersionString(cls): + return "2.3 (2018-11-01)" + + # getVersionString() + + @classmethod + def getOptions(cls): + return { + "unvalidated": "[yes|no] -- store SNP loci which have not been validated (default: yes)", + "suspect": "[yes|no] -- store SNP loci which are suspect (default: no)", # http://www.ncbi.nlm.nih.gov/projects/SNP/docs/rs_attributes.html#suspect + "withdrawn": "[yes|no] -- store SNP loci which have been withdrawn (default: no)", + "loci": "[all|validated] -- store all or only validated SNP loci (default: validat`dddded)", + "merges": "[yes|no] -- process and store RS# merge history (default: yes)", + "roles": "[yes|no] -- process and store SNP roles (default: no)", + } + + # getOptions() + + def validateOptions(self, options): + options.setdefault("unvalidated", "yes") + options.setdefault("suspect", "no") + options.setdefault("withdrawn", "no") + options.setdefault("merges", "yes") + options.setdefault("roles", "no") + for o, v in options.items(): + v = v.strip().lower() + if o in ("unvalidated", "suspect", "withdrawn", "merges", "roles"): + if "yes".startswith(v): + v = "yes" + elif "no".startswith(v): + v = "no" + else: + return "%s must be 'yes' or 'no'" % o + else: + return "unknown option '%s'" % o + options[o] = v + return True + + # validateOptions() + + def download(self, options, path): + # define a callback to identify the latest SNPContigLocusId file + def remFilesCallback(ftp, path): + remFiles = dict() + for chm in self._chmList: + remFiles[path + "/chr_" + chm + ".txt.gz"] = ( + "/snp/organisms/human_9606/chr_rpts/chr_%s.txt.gz" % chm + ) + + if options["merges"] == "yes": + remFiles[path + "/RsMergeArch.bcp.gz"] = ( + "/snp/organisms/human_9606/database/organism_data/RsMergeArch.bcp.gz" + ) + + if options.get["roles"] == "yes": + remFiles[path + "/SnpFunctionCode.bcp.gz"] = ( + "/snp/organisms/database/shared_data/SnpFunctionCode.bcp.gz" + ) + urlpath = "/snp/organisms/human_9606/database/organism_data" + ftp.cwd(urlpath) + bestfile = self._identifyLatestSNPContig(ftp.nlst()) + + if bestfile: + remFiles[bestfile] = "%s/%s" % (urlpath, bestfile) + + return remFiles + + # remFilesCallback + + remFiles = dict() + for chm in self._chmList: + remFiles[path + "/chr_%s.txt.gz" % chm] = ( + "/snp/organisms/human_9606/chr_rpts/chr_%s.txt.gz" % chm + ) + if options["merges"] == "yes": + remFiles[path + "/RsMergeArch.bcp.gz"] = ( + "/snp/organisms/human_9606/database/organism_data/RsMergeArch.bcp.gz" + ) + if options["roles"] == "yes": + remFiles[path + "/SnpFunctionCode.bcp.gz"] = ( + 
"/snp/organisms/database/shared_data/SnpFunctionCode.bcp.gz" + ) + urlfolderpath = "/snp/organisms/human_9606/database/organism_data" + urlpath = urllib2.urlopen("https://ftp.ncbi.nih.gov" + urlfolderpath) + string = urlpath.read().decode("utf-8") + onlyfiles = list( + set(re.findall(r"b([0-9]+)_SNPContigLocusId_(.*)\.bcp\.gz", string)) + ) + bestfile = self._identifyLatestSNPContig(onlyfiles) + bestfilename = ( + "b" + bestfile[0] + "_SNPContigLocusId_" + bestfile[1] + ".bcp.gz" + ) + if bestfile: + remFiles[path + "/" + bestfilename] = "%s/%s" % ( + urlfolderpath, + bestfilename, + ) + + # download the latest source files + # self.downloadFilesFromFTP('ftp.ncbi.nih.gov', remFilesCallback) + self.downloadFilesFromHTTP("ftp.ncbi.nih.gov", remFiles) + + return list(remFiles.keys()) + + # download() + + def update(self, options, path): + # clear out all old data from this source + self.log("deleting old records from the database ...\n") + self.deleteAll() + self.log("deleting old records from the database completed\n") + + # process merge report (no header!) + if options.get("merges", "yes") == "yes": + """/* from human_9606_table.sql.gz */ + CREATE TABLE [RsMergeArch] + ( + [rsHigh] [int] NULL , + [rsLow] [int] NULL , + [build_id] [int] NULL , + [orien] [tinyint] NULL , + [create_time] [datetime] NOT NULL , + [last_updated_time] [datetime] NOT NULL , + [rsCurrent] [int] NULL , + [orien2Current] [tinyint] NULL , + [comment] [varchar](255) NULL + ) + """ + self.log("processing SNP merge records ...\n") + mergeFile = self.zfile( + path + "/RsMergeArch.bcp.gz" + ) # TODO:context manager,iterator + numMerge = 0 + setMerge = set() + for line in mergeFile: + words = line.split("\t") + if not (len(words) > 6 and words[0] and words[6]): + continue + rsOld = int(words[0]) + # rsNew = int(words[1]) + rsCur = int(words[6]) + + setMerge.add((rsOld, rsCur)) + + # write to the database after each 2.5 million, to keep memory usage down + if len(setMerge) >= 2500000: + numMerge += len(setMerge) + self.log( + "processing SNP merge records: ~%1.1f million so far\n" + % (numMerge / 1000000.0) + ) # TODO: time estimate + self.log("writing SNP merge records to the database ...\n") + self.addSNPMerges(setMerge) + setMerge = set() + self.log("writing SNP merge records to the database completed\n") + # foreach line in mergeFile + numMerge += len(setMerge) + self.log( + "processing SNP merge records completed: ~%d merged RS#s\n" % numMerge + ) + if setMerge: + self.log("writing SNP merge records to the database ...\n") + self.addSNPMerges(setMerge) + self.log("writing SNP merge records to the database completed\n") + setMerge = None + # if merges + + # process SNP role function codes + if options.get("roles", "no") == "yes": + """/* from dbSNP_main_table.sql.gz */ + CREATE TABLE [SnpFunctionCode] + ( + [code] [tinyint] NOT NULL , + [abbrev] [varchar](20) NOT NULL , + [descrip] [varchar](255) NOT NULL , + [create_time] [smalldatetime] NOT NULL , + [top_level_class] [char](5) NOT NULL , + [is_coding] [tinyint] NOT NULL , + [is_exon] [bit] NULL , + [var_prop_effect_code] [int] NULL , + [var_prop_gene_loc_code] [int] NULL , + [SO_id] [varchar](32) NULL + ) + """ + self.log("processing SNP role codes ...\n") + roleID = dict() + codeFile = self.zfile(path + "/SnpFunctionCode.bcp.gz") + for line in codeFile: + words = line.split("\t") + code = int(words[0]) + name = words[1] + desc = words[2] + coding = int(words[5]) if (len(words) > 5 and words[5] != "") else None + exon = int(words[6]) if (len(words) > 6 and words[6] != 
"") else None + + roleID[code] = self.addRole(name, desc, coding, exon) + # foreach line in codeFile + self.log("processing SNP role codes completed: %d codes\n" % len(roleID)) + + # process SNP roles + """ /* from human_9606_table.sql.gz */ CREATE TABLE [b137_SNPContigLocusId] ( [snp_id] [int] NULL , @@ -246,168 +296,223 @@ def update(self, options, path): [verComp] [int] NULL ) """ - self.log("processing SNP roles ...\n") - setRole = set() - numRole = numOrphan = numInc = 0 - setOrphan = set() - funcFile = self.zfile(list(filter(re.compile(r'b([0-9]+)_SNPContigLocusId_(.*)\.bcp\.gz').match, os.listdir(path)))[0]) - for line in funcFile: - words = list(w.strip() for w in line.split("\t")) - rs = int(words[0]) if words[0] else None - entrez = int(words[5]) if words[5] else None - #genesymbol = words[6] - code = int(words[11]) if words[11] else None - - if rs and entrez and code: - try: - setRole.add( (rs,entrez,roleID[code]) ) - except KeyError: - setOrphan.add(code) - numOrphan += 1 - else: - numInc += 1 - - # write to the database after each 2.5 million, to keep memory usage down - if len(setRole) >= 2500000: - numRole += len(setRole) - self.log("processing SNP roles: ~%1.1f million so far\n" % (numRole/1000000.0)) #TODO: time estimate - self.log("writing SNP roles to the database ...\n") - self.addSNPEntrezRoles(setRole) - setRole = set() - self.log("writing SNP roles to the database completed\n") - - roleID = None - #foreach line in funcFile - numRole += len(setRole) - self.log("processing SNP roles completed: ~%d roles\n" % (numRole,)) - if setRole: - self.log("writing SNP roles to the database ...\n") - self.addSNPEntrezRoles(setRole) - self.log("writing SNP roles to the database completed\n") - setRole = None - - # warn about orphans - if setOrphan: - self.log("WARNING: %d roles (%d codes) unrecognized\n" % (numOrphan,len(setOrphan))) - if numInc: - self.log("WARNING: %d roles incomplete\n" % (numInc,)) - setOrphan = None - #if roles - - # process chromosome report files - # dbSNP chromosome reports use 1-based coordinates since b125, according to: - # http://www.ncbi.nlm.nih.gov/books/NBK44414/#Reports.the_xml_dump_for_build_126_has_a - # This matches LOKI's convention. 
- reBuild = re.compile('GRCh([0-9]+)') - includeUnvalidated = (options['unvalidated'] == 'yes') - includeSuspect = (options['suspect'] == 'yes') - includeWithdrawn = (options['withdrawn'] == 'yes') - processChmThreads = {} - for fileChm in self._chmList: - processChmThreads[fileChm] = Thread(target=self.processChmSNPs, args=(fileChm, reBuild, includeUnvalidated, includeSuspect, includeWithdrawn, path)) - processChmThreads[fileChm].start() - #foreach chromosome - for fileChm in self._chmList: - processChmThreads[fileChm].join() - - # store source metadata - self.setSourceBuilds(self._grcBuild, None) - #update() - - def processChmSNPs(self, fileChm, reBuild, includeUnvalidated, includeSuspect, includeWithdrawn, path): - self.log("processing chromosome %s SNPs ...\n" % fileChm) - chmFile = self.zfile(path+'/chr_'+fileChm+'.txt.gz') - - # verify file headers - header1 = chmFile.__next__().rstrip() - chmFile.__next__() - chmFile.__next__() - header2 = chmFile.__next__().rstrip() - header3 = chmFile.__next__().rstrip() - chmFile.__next__() - chmFile.__next__() - if not header1.startswith("dbSNP Chromosome Report"): - raise Exception("ERROR: unrecognized file header '%s'" % header1) - if not header2.startswith("rs#\tmap\tsnp\tchr\tctg\ttotal\tchr\tctg\tctg\tctg\tctg\tchr\tlocal\tavg\ts.e.\tmax\tvali-\tgeno-\tlink\torig\tupd"): - raise Exception("ERROR: unrecognized file subheader '%s'" % header2) - if not header3.startswith("\twgt\ttype\thits\thits\thits\t\tacc\tver\tID\tpos\tpos\tloci\thet\thet\tprob\tdated\ttypes\touts\tbuild\tbuild"): - raise Exception("ERROR: unrecognized file subheader '%s'" % header3) - # process lines - numPos = numPosBatch = 0 - listChrPos = collections.defaultdict(list) - setBadBuild = set() - setBadVers = set() - setBadFilter = set() - setBadChr = set() - for line in chmFile: - words = line.split("\t") - rs = words[0].strip() - withdrawn = (int(words[2].strip()) > 0) - chm = words[6].strip() - pos = words[11].strip() - validated = 1 if (int(words[16].strip()) > 0) else 0 - build = reBuild.search(words[21]) - suspect = (int(words[22].strip()) > 0) - - if rs != '' and chm != '' and pos != '': - rs = int(rs) - pos = int(pos) - if not build: - setBadBuild.add(rs) - elif self._grcBuild and self._grcBuild != build.group(1): - setBadVers.add(rs) - elif not (validated or includeUnvalidated): - setBadFilter.add(rs) - elif suspect and not includeSuspect: - setBadFilter.add(rs) - elif withdrawn and not includeWithdrawn: - setBadFilter.add(rs) - elif (fileChm != 'PAR') and (chm != fileChm): - setBadChr.add(rs) - elif (fileChm == 'PAR') and (chm != 'X') and (chm != 'Y'): - setBadChr.add(rs) - else: - if not self._grcBuild: - self._grcBuild = build.group(1) - numPosBatch += 1 - listChrPos[chm].append( (rs,pos,validated) ) - setBadChr.discard(rs) - setBadFilter.discard(rs) - setBadVers.discard(rs) - setBadBuild.discard(rs) - if numPosBatch >= 2500000: - numPos += numPosBatch - numPosBatch = 0 - self.log("processing chromosome %s: %1.1f million so far\n" % (fileChm, numPos/1000000.0)) - # store data - self.log("writing chromosome %s SNPs to the database ...\n" % fileChm) - for chm,listPos in listChrPos.items(): - self.addChromosomeSNPLoci(self._loki.chr_num[chm], listPos) - listChrPos = collections.defaultdict(list) - self.log("writing chromosome %s SNPs to the database completed\n" % fileChm) - #if rs/chm/pos provided - #foreach line in chmFile - self.log("processing chromosome %s SNPs: %d SNP loci\n" % (fileChm, numPos)) - # store data - if listChrPos: - self.log("writing chromosome 
%s SNPs to the database ...\n" % fileChm) - for chm,listPos in listChrPos.items(): - self.addChromosomeSNPLoci(self._loki.chr_num[chm], listPos) - self.log("writing chromosome %s SNPs to the database completed\n" % fileChm) - - # print results - numPos += numPosBatch - setBadFilter.difference_update(setBadChr) - setBadVers.difference_update(setBadChr, setBadFilter) - setBadBuild.difference_update(setBadChr, setBadFilter, setBadVers) - if setBadBuild: - self.log("WARNING: %d SNPs not mapped to any GRCh build\n" % (len(setBadBuild))) - if setBadVers: - self.log("WARNING: %d SNPs mapped to GRCh build version other than %s\n" % (len(setBadVers),self._grcBuild)) - if setBadFilter: - self.log("WARNING: %d SNPs skipped (unvalidated, suspect and/or withdrawn)\n" % (len(setBadFilter))) - if setBadChr: - self.log("WARNING: %d SNPs on mismatching chromosome\n" % (len(setBadChr))) - listChrPos = setBadBuild = setBadVers = setBadFilter = setBadChr = None - #processChmSNPs() -#Source_dbsnp + self.log("processing SNP roles ...\n") + setRole = set() + numRole = numOrphan = numInc = 0 + setOrphan = set() + funcFile = self.zfile( + list( + filter( + re.compile(r"b([0-9]+)_SNPContigLocusId_(.*)\.bcp\.gz").match, + os.listdir(path), + ) + )[0] + ) + for line in funcFile: + words = list(w.strip() for w in line.split("\t")) + rs = int(words[0]) if words[0] else None + entrez = int(words[5]) if words[5] else None + # genesymbol = words[6] + code = int(words[11]) if words[11] else None + + if rs and entrez and code: + try: + setRole.add((rs, entrez, roleID[code])) + except KeyError: + setOrphan.add(code) + numOrphan += 1 + else: + numInc += 1 + + # write to the database after each 2.5 million, to keep memory usage down + if len(setRole) >= 2500000: + numRole += len(setRole) + self.log( + "processing SNP roles: ~%1.1f million so far\n" + % (numRole / 1000000.0) + ) # TODO: time estimate + self.log("writing SNP roles to the database ...\n") + self.addSNPEntrezRoles(setRole) + setRole = set() + self.log("writing SNP roles to the database completed\n") + + roleID = None + # foreach line in funcFile + numRole += len(setRole) + self.log("processing SNP roles completed: ~%d roles\n" % (numRole,)) + if setRole: + self.log("writing SNP roles to the database ...\n") + self.addSNPEntrezRoles(setRole) + self.log("writing SNP roles to the database completed\n") + setRole = None + + # warn about orphans + if setOrphan: + self.log( + "WARNING: %d roles (%d codes) unrecognized\n" + % (numOrphan, len(setOrphan)) + ) + if numInc: + self.log("WARNING: %d roles incomplete\n" % (numInc,)) + setOrphan = None + # if roles + + # process chromosome report files + # dbSNP chromosome reports use 1-based coordinates since b125, according to: + # http://www.ncbi.nlm.nih.gov/books/NBK44414/#Reports.the_xml_dump_for_build_126_has_a + # This matches LOKI's convention. 
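# Illustrative sketch (not part of the loader or this diff): the merge, role,
# and chromosome loops in this updater all follow the same memory-bounding
# pattern -- parsed rows are accumulated in a set or list and flushed to the
# database once roughly 2.5 million are queued, then the buffer is cleared.
# A minimal generic version of that pattern, assuming a caller-supplied
# `write_batch` callable (a placeholder, not a LOKI API):
def _batched_write(rows, write_batch, batch_size=2500000):
    """Accumulate items from `rows` and hand each full chunk to `write_batch`."""
    pending = []
    for row in rows:
        pending.append(row)
        if len(pending) >= batch_size:
            write_batch(pending)  # e.g. something like self.addChromosomeSNPLoci(...)
            pending = []
    if pending:
        write_batch(pending)  # flush whatever remains after the loop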
+ reBuild = re.compile("GRCh([0-9]+)") + includeUnvalidated = options["unvalidated"] == "yes" + includeSuspect = options["suspect"] == "yes" + includeWithdrawn = options["withdrawn"] == "yes" + processChmThreads = {} + for fileChm in self._chmList: + processChmThreads[fileChm] = Thread( + target=self.processChmSNPs, + args=( + fileChm, + reBuild, + includeUnvalidated, + includeSuspect, + includeWithdrawn, + path, + ), + ) + processChmThreads[fileChm].start() + # foreach chromosome + for fileChm in self._chmList: + processChmThreads[fileChm].join() + + # store source metadata + self.setSourceBuilds(self._grcBuild, None) + + # update() + + def processChmSNPs( + self, + fileChm, + reBuild, + includeUnvalidated, + includeSuspect, + includeWithdrawn, + path, + ): + self.log("processing chromosome %s SNPs ...\n" % fileChm) + chmFile = self.zfile(path + "/chr_" + fileChm + ".txt.gz") + + # verify file headers + header1 = chmFile.__next__().rstrip() + chmFile.__next__() + chmFile.__next__() + header2 = chmFile.__next__().rstrip() + header3 = chmFile.__next__().rstrip() + chmFile.__next__() + chmFile.__next__() + if not header1.startswith("dbSNP Chromosome Report"): + raise Exception("ERROR: unrecognized file header '%s'" % header1) + if not header2.startswith( + "rs#\tmap\tsnp\tchr\tctg\ttotal\tchr\tctg\tctg\tctg\tctg\tchr\tlocal\tavg\ts.e.\tmax\tvali-\tgeno-\tlink\torig\tupd" + ): + raise Exception("ERROR: unrecognized file subheader '%s'" % header2) + if not header3.startswith( + "\twgt\ttype\thits\thits\thits\t\tacc\tver\tID\tpos\tpos\tloci\thet\thet\tprob\tdated\ttypes\touts\tbuild\tbuild" + ): + raise Exception("ERROR: unrecognized file subheader '%s'" % header3) + # process lines + numPos = numPosBatch = 0 + listChrPos = collections.defaultdict(list) + setBadBuild = set() + setBadVers = set() + setBadFilter = set() + setBadChr = set() + for line in chmFile: + words = line.split("\t") + rs = words[0].strip() + withdrawn = int(words[2].strip()) > 0 + chm = words[6].strip() + pos = words[11].strip() + validated = 1 if (int(words[16].strip()) > 0) else 0 + build = reBuild.search(words[21]) + suspect = int(words[22].strip()) > 0 + + if rs != "" and chm != "" and pos != "": + rs = int(rs) + pos = int(pos) + if not build: + setBadBuild.add(rs) + elif self._grcBuild and self._grcBuild != build.group(1): + setBadVers.add(rs) + elif not (validated or includeUnvalidated): + setBadFilter.add(rs) + elif suspect and not includeSuspect: + setBadFilter.add(rs) + elif withdrawn and not includeWithdrawn: + setBadFilter.add(rs) + elif (fileChm != "PAR") and (chm != fileChm): + setBadChr.add(rs) + elif (fileChm == "PAR") and (chm != "X") and (chm != "Y"): + setBadChr.add(rs) + else: + if not self._grcBuild: + self._grcBuild = build.group(1) + numPosBatch += 1 + listChrPos[chm].append((rs, pos, validated)) + setBadChr.discard(rs) + setBadFilter.discard(rs) + setBadVers.discard(rs) + setBadBuild.discard(rs) + if numPosBatch >= 2500000: + numPos += numPosBatch + numPosBatch = 0 + self.log( + "processing chromosome %s: %1.1f million so far\n" + % (fileChm, numPos / 1000000.0) + ) + # store data + self.log( + "writing chromosome %s SNPs to the database ...\n" % fileChm + ) + for chm, listPos in listChrPos.items(): + self.addChromosomeSNPLoci(self._loki.chr_num[chm], listPos) + listChrPos = collections.defaultdict(list) + self.log( + "writing chromosome %s SNPs to the database completed\n" + % fileChm + ) + # if rs/chm/pos provided + # foreach line in chmFile + self.log("processing chromosome %s SNPs: %d SNP loci\n" 
% (fileChm, numPos)) + # store data + if listChrPos: + self.log("writing chromosome %s SNPs to the database ...\n" % fileChm) + for chm, listPos in listChrPos.items(): + self.addChromosomeSNPLoci(self._loki.chr_num[chm], listPos) + self.log("writing chromosome %s SNPs to the database completed\n" % fileChm) + + # print results + numPos += numPosBatch + setBadFilter.difference_update(setBadChr) + setBadVers.difference_update(setBadChr, setBadFilter) + setBadBuild.difference_update(setBadChr, setBadFilter, setBadVers) + if setBadBuild: + self.log( + "WARNING: %d SNPs not mapped to any GRCh build\n" % (len(setBadBuild)) + ) + if setBadVers: + self.log( + "WARNING: %d SNPs mapped to GRCh build version other than %s\n" + % (len(setBadVers), self._grcBuild) + ) + if setBadFilter: + self.log( + "WARNING: %d SNPs skipped (unvalidated, suspect and/or withdrawn)\n" + % (len(setBadFilter)) + ) + if setBadChr: + self.log("WARNING: %d SNPs on mismatching chromosome\n" % (len(setBadChr))) + listChrPos = setBadBuild = setBadVers = setBadFilter = setBadChr = None + + # processChmSNPs() + + +# Source_dbsnp diff --git a/loki/loaders/loki_source_entrez.py b/loki/loaders/loki_source_entrez.py index 1dc2001..74c16c2 100644 --- a/loki/loaders/loki_source_entrez.py +++ b/loki/loaders/loki_source_entrez.py @@ -6,481 +6,597 @@ class Source_entrez(loki_source.Source): - - - @classmethod - def getVersionString(cls): - return '2.4 (2022-04-12)' - #getVersionString() - - - @classmethod - def getOptions(cls): - return { - 'locus-tags' : "[yes|no] -- include a gene's 'Locus Tag' as an alias (default: no)", - 'favor-primary' : "[yes|no] -- reduce symbol ambiguity by favoring primary symbols (default: yes)", - 'favor-hist' : "[yes|no] -- reduce symbol ambiguity by favoring primary symbols (default: yes)", - } - #getOptions() - - - def validateOptions(self, options): - for o,v in options.items(): - v = v.strip().lower() - if o in ('locus-tags','favor-primary','favor-hist'): - if 'yes'.startswith(v): - v = 'yes' - elif 'no'.startswith(v): - v = 'no' - else: - return "%s must be 'yes' or 'no'" % o - else: - return "unknown option '%s'" % o - options[o] = v - return True - #validateOptions() - - - def download(self, options, path): - # download the latest source files -# self.downloadFilesFromFTP('ftp.ncbi.nih.gov', { -# 'Homo_sapiens.gene_info.gz': '/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz', -# 'gene2refseq.gz': '/gene/DATA/gene2refseq.gz', -# 'gene_history.gz': '/gene/DATA/gene_history.gz', -# 'gene2ensembl.gz': '/gene/DATA/gene2ensembl.gz', -# 'gene2unigene': '/gene/DATA/ARCHIVE/gene2unigene', -# 'gene_refseq_uniprotkb_collab.gz': '/gene/DATA/gene_refseq_uniprotkb_collab.gz', -# }) -# self.downloadFilesFromFTP('ftp.uniprot.org', { -# 'HUMAN_9606_idmapping_selected.tab.gz': '/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz', -# }) - - self.downloadFilesFromHTTP('ftp.ncbi.nih.gov', { - path+'/Homo_sapiens.gene_info.gz': '/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz', - path+'/gene2refseq.gz': '/gene/DATA/gene2refseq.gz', - path+'/gene_history.gz': '/gene/DATA/gene_history.gz', - path+'/gene2ensembl.gz': '/gene/DATA/gene2ensembl.gz', - path+'/gene2unigene': '/gene/DATA/ARCHIVE/gene2unigene', - path+'/gene_refseq_uniprotkb_collab.gz': '/gene/DATA/gene_refseq_uniprotkb_collab.gz', - }) - self.downloadFilesFromHTTP('ftp.ebi.ac.uk', { - path+'/HUMAN_9606_idmapping_selected.tab.gz': 
'/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz', - }) - - return [ - path+'/Homo_sapiens.gene_info.gz', - path+'/gene2refseq.gz', - path+'/gene_history.gz', - path+'/gene2ensembl.gz', - path+'/gene2unigene', - path+'/gene_refseq_uniprotkb_collab.gz', - path+'/HUMAN_9606_idmapping_selected.tab.gz' - ] - #download() - - - def update(self, options, path): - # clear out all old data from this source - self.log("deleting old records from the database ...\n") - self.deleteAll() - self.log("deleting old records from the database completed\n") - - # get or create the required metadata records - ldprofileID = self.addLDProfiles([ - ('', 'no LD adjustment', None, None), - ]) - namespaceID = self.addNamespaces([ - ('symbol', 0), - ('entrez_gid', 0), - ('refseq_gid', 0), - ('refseq_pid', 1), - ('ensembl_gid', 0), - ('ensembl_pid', 1), - ('hgnc_id', 0), - ('mim_id', 0), - ('hprd_id', 0), - ('vega_id', 0), - ('rgd_id', 0), - ('mirbase_id', 0), - ('unigene_gid', 0), - ('uniprot_gid', 0), - ('uniprot_pid', 1), - ]) - typeID = self.addTypes([ - ('gene',), - ]) - - nsNames = { ns:set() for ns in namespaceID } - nsNameNames = { ns:set() for ns in namespaceID } - numNames = numNameNames = numNameRefs = 0 - - # process genes (no header!) - self.log("processing genes ...\n") - entrezGene = dict() - entrezChm = dict() - primaryEntrez = dict() - xrefNS = { - 'Ensembl_G': 'ensembl_gid', - 'Ensembl_T': 'ensembl_gid', - 'Ensembl_P': 'ensembl_pid', - 'HGNC': 'hgnc_id', - 'MIM': 'mim_id', - 'HPRD': 'hprd_id', - 'Vega': 'vega_id', - 'RGD': 'rgd_id', - 'miRBase': 'mirbase_id', - } - geneFile = self.zfile(path+'/Homo_sapiens.gene_info.gz') #TODO:context manager,iterator - for line in geneFile: - # quickly filter out all non-9606 (human) taxonomies before taking the time to split() - if line.startswith("9606\t"): - words = line.rstrip().split("\t") - entrezID = int(words[1]) - symbol = words[2] - aliases = words[4].split("|") if words[4] != "-" else list() - if options.get('locus-tags','no') == 'yes' and words[3] != "-": - aliases.append(words[3]) - xrefs = words[5].split("|") if words[5] != "-" else list() - chm = words[6] - desc = words[8] - - entrezGene[entrezID] = (symbol,desc) - entrezChm[entrezID] = chm - if symbol not in primaryEntrez: - primaryEntrez[symbol] = entrezID - elif primaryEntrez[symbol] != entrezID: - primaryEntrez[symbol] = False - - # entrezID as a name for itself looks funny here, but later on - # we'll be translating the target entrezID to biopolymer_id and - # adding more historical entrezID aliases - nsNames['entrez_gid'].add( (entrezID,entrezID) ) - nsNames['symbol'].add( (entrezID,symbol) ) - for alias in aliases: - nsNames['symbol'].add( (entrezID,alias) ) - for xref in xrefs: - xrefDB,xrefID = xref.split(":",1) - # turn ENSG/ENSP/ENST into Ensembl_X - if xrefDB == "Ensembl" and xrefID.startswith("ENS") and len(xrefID) > 3: - xrefDB = "Ensembl_%c" % xrefID[3] - if xrefDB in xrefNS: - nsNames[xrefNS[xrefDB]].add( (entrezID,xrefID) ) - #if taxonomy is 9606 (human) - #foreach line in geneFile - - # delete any symbol alias which is also the primary name of exactly one other gene - if options.get('favor-primary','yes') == 'yes': - dupe = set() - for alias in nsNames['symbol']: - entrezID = alias[0] - symbol = alias[1] - if (symbol in primaryEntrez) and (primaryEntrez[symbol] != False) and (primaryEntrez[symbol] != entrezID): - dupe.add(alias) - nsNames['symbol'] -= dupe - dupe = None - #if favor-primary - - # print stats - numGenes = 
len(entrezGene) - numNames0 = numNames - numNames = sum(len(nsNames[ns]) for ns in nsNames) - self.log("processing genes completed: %d genes, %d identifiers\n" % (numGenes,numNames-numNames0)) - - # store genes - self.log("writing genes to the database ...\n") - listEntrez = entrezGene.keys() - listBID = self.addTypedBiopolymers(typeID['gene'], (entrezGene[entrezID] for entrezID in listEntrez)) - entrezBID = dict(zip(listEntrez,listBID)) - numGenes = len(entrezBID) - self.log("writing genes to the database completed: %d genes\n" % (numGenes)) - entrezGene = None - - # translate target entrezID to biopolymer_id in nsNames - for ns in nsNames: - names = set( (entrezBID[name[0]],name[1]) for name in nsNames[ns] if name[0] in entrezBID ) - nsNames[ns] = names - numNames = sum(len(nsNames[ns]) for ns in nsNames) - - # process gene regions - # Entrez sequences use 0-based closed intervals, according to: - # http://www.ncbi.nlm.nih.gov/books/NBK3840/#genefaq.Representation_of_nucleotide_pos - # and comparison of web-reported boundary coordinates to gene length (len = end - start + 1). - # Since LOKI uses 1-based closed intervals, we add 1 to all coordinates. - self.log("processing gene regions ...\n") - reBuild = re.compile('GRCh([0-9]+)') - grcBuild = None - buildGenes = collections.defaultdict(set) - buildRegions = collections.defaultdict(set) - setOrphan = set() - setBadNC = set() - setBadBuild = set() - setBadChr = set() - refseqBIDs = collections.defaultdict(set) - regionFile = self.zfile(path+'/gene2refseq.gz') #TODO:context manager,iterator - header = regionFile.__next__().rstrip() - if not ( - header.startswith("#Format: tax_id GeneID status RNA_nucleotide_accession.version RNA_nucleotide_gi protein_accession.version protein_gi genomic_nucleotide_accession.version genomic_nucleotide_gi start_position_on_the_genomic_accession end_position_on_the_genomic_accession orientation assembly") # "(tab is used as a separator, pound sign - start of a comment)" - or header.startswith("#tax_id GeneID status RNA_nucleotide_accession.version RNA_nucleotide_gi protein_accession.version protein_gi genomic_nucleotide_accession.version genomic_nucleotide_gi start_position_on_the_genomic_accession end_position_on_the_genomic_accession orientation assembly") # " mature_peptide_accession.version mature_peptide_gi Symbol" - ): - self.log(" ERROR: unrecognized file header\n") - self.log("%s\n" % header) - else: - for line in regionFile: - # skip non-9606 (human) taxonomies before taking the time to split() - if not line.startswith("9606\t"): - continue - - # grab relevant columns - words = line.split("\t") - entrezID = int(words[1]) - rnaAcc = words[3].rsplit('.',1)[0] if words[3] != "-" else None - proAcc = words[5].rsplit('.',1)[0] if words[5] != "-" else None - genAcc = words[7].rsplit('.',1)[0] if words[7] != "-" else None - posMin = (int(words[9])+1) if words[9] != "-" else None - posMax = (int(words[10])+1) if words[10] != "-" else None - build = reBuild.search(words[12].rstrip() if (len(words) > 12 and words[12] != "-") else '') - - # skip unrecognized IDs - if entrezID not in entrezBID: - setOrphan.add(entrezID) - continue - - # store rna and protein sequence RefSeq IDs - # (don't store genAcc, there's only one per chromosome) - if rnaAcc: - nsNames['refseq_gid'].add( (entrezBID[entrezID],rnaAcc) ) - if proAcc: - nsNames['refseq_pid'].add( (entrezBID[entrezID],proAcc) ) - refseqBIDs[proAcc].add(entrezBID[entrezID]) - - # skip non-whole-chromosome regions - # (refseq accession types: 
http://www.ncbi.nlm.nih.gov/RefSeq/key.html) - if not (genAcc and genAcc.startswith('NC_')): - setBadNC.add(entrezID) - continue - elif not build: - setBadBuild.add(entrezID) - continue - - # skip chromosome mismatches - if genAcc in ('NC_001807','NC_012920'): #TODO: avoid hardcoding this mapping - chm = self._loki.chr_num.get('MT') - else: - chm = self._loki.chr_num.get(genAcc[3:].lstrip('0')) - if not chm: - setBadChr.add(entrezID) - continue - elif (entrezID in entrezChm) and (self._loki.chr_name[chm] not in entrezChm[entrezID].split('|')): - # TODO: make sure we want to ignore any gene region with an ambiguous chromosome - # (i.e. gene_info says one thing, gene2refseq says another) - #print "%s %s -> %s" % (entrezID,entrezChm[entrezID],self._loki.chr_name[chm]) - #100293744 X -> Y - #100302657 3 -> 15 - #100418703 Y -> X - #100507426 Y -> X - setBadChr.add(entrezID) - continue - - # store the region by build version number, so we can pick the majority build later - buildGenes[build.group(1)].add(entrezID) - buildRegions[build.group(1)].add( (entrezBID[entrezID],chm,posMin,posMax) ) - #foreach line in regionFile - - # identify majority build version - grcBuild = max(buildRegions, key=lambda build: len(buildRegions[build])) - setBadVers = set() - for build,genes in buildGenes.items(): - if build != grcBuild: - setBadVers.update(genes) - - # print stats - setBadVers.difference_update(buildGenes[grcBuild]) - setBadChr.difference_update(buildGenes[grcBuild], setBadVers) - setBadBuild.difference_update(buildGenes[grcBuild], setBadVers, setBadChr) - setBadNC.difference_update(buildGenes[grcBuild], setBadVers, setBadChr, setBadNC) - numRegions = len(buildRegions[grcBuild]) - numGenes = len(buildGenes[grcBuild]) - numNames0 = numNames - numNames = sum(len(nsNames[ns]) for ns in nsNames) - self.log("processing gene regions completed: %d regions (%d genes), %d identifiers\n" % (numRegions,numGenes,numNames-numNames0)) - self.logPush() - if setOrphan: - self.log("WARNING: %d regions for undefnied EntrezIDs\n" % (len(setOrphan))) - if setBadNC: - self.log("WARNING: %d genes not mapped to whole chromosome\n" % (len(setBadNC))) - if setBadBuild: - self.log("WARNING: %d genes not mapped to any GRCh build\n" % (len(setBadBuild))) - if setBadVers: - self.log("WARNING: %d genes mapped to GRCh build version other than %s\n" % (len(setBadVers),grcBuild)) - if setBadChr: - self.log("WARNING: %d genes on mismatching chromosome\n" % (len(setBadChr))) - self.logPop() - entrezChm = setOrphan = setBadNC = setBadBuild = setBadChr = setBadVers = buildGenes = None - - # store gene regions - self.log("writing gene regions to the database ...\n") - numRegions = len(buildRegions[grcBuild]) - self.addBiopolymerLDProfileRegions(ldprofileID[''], buildRegions[grcBuild]) - self.log("writing gene regions to the database completed: %d regions\n" % (numRegions)) - buildRegions = None - #if gene regions header ok - - # process historical gene names - self.log("processing historical gene names ...\n") - entrezUpdate = {} - historyEntrez = {} - histFile = self.zfile(path+'/gene_history.gz') #TODO:context manager,iterator - header = histFile.__next__().rstrip() - if not ( - header.startswith("#Format: tax_id GeneID Discontinued_GeneID Discontinued_Symbol") # "Discontinue_Date (tab is used as a separator, pound sign - start of a comment)" - or header.startswith("#tax_id GeneID Discontinued_GeneID Discontinued_Symbol") # "Discontinue_Date" - ): - self.log(" ERROR: unrecognized file header\n") - self.log("%s\n" % header) - else: - 
for line in histFile: - # quickly filter out all non-9606 (human) taxonomies before taking the time to split() - if line.startswith("9606\t"): - words = line.split("\t") - entrezID = int(words[1]) if words[1] != "-" else None - oldEntrez = int(words[2]) if words[2] != "-" else None - oldName = words[3] if words[3] != "-" else None - - if entrezID and entrezID in entrezBID: - if oldEntrez and oldEntrez != entrezID: - entrezUpdate[oldEntrez] = entrezID - nsNames['entrez_gid'].add( (entrezBID[entrezID],oldEntrez) ) - if oldName and (oldName not in primaryEntrez or primaryEntrez[oldName] == False): - if oldName not in historyEntrez: - historyEntrez[oldName] = entrezID - elif historyEntrez[oldName] != entrezID: - historyEntrez[oldName] = False - nsNames['symbol'].add( (entrezBID[entrezID],oldName) ) - #if taxonomy is 9606 (human) - #foreach line in histFile - - # delete any symbol alias which is also the historical name of exactly one other gene - if options.get('favor-hist','yes') == 'yes': - dupe = set() - for alias in nsNames['symbol']: - entrezID = alias[0] - symbol = alias[1] - if (symbol in historyEntrez) and (historyEntrez[symbol] != False) and (historyEntrez[symbol] != entrezID): - dupe.add(alias) - nsNames['symbol'] -= dupe - dupe = None - #if favor-hist - - # print stats - numNames0 = numNames - numNames = sum(len(nsNames[ns]) for ns in nsNames) - self.log("processing historical gene names completed: %d identifiers\n" % (numNames-numNames0)) - #if historical name header ok - - # process ensembl gene names - self.log("processing ensembl gene names ...\n") - ensFile = self.zfile(path+'/gene2ensembl.gz') #TODO:context manager,iterator - header = ensFile.__next__().rstrip() - if not ( - header.startswith("#Format: tax_id GeneID Ensembl_gene_identifier RNA_nucleotide_accession.version Ensembl_rna_identifier protein_accession.version Ensembl_protein_identifier") # "(tab is used as a separator, pound sign - start of a comment)" - or header.startswith("#tax_id GeneID Ensembl_gene_identifier RNA_nucleotide_accession.version Ensembl_rna_identifier protein_accession.version Ensembl_protein_identifier") - ): - self.log(" ERROR: unrecognized file header\n") - self.log("%s\n" % header) - else: - for line in ensFile: - # quickly filter out all non-9606 (human) taxonomies before taking the time to split() - if line.startswith("9606\t"): - words = line.split("\t") - entrezID = int(words[1]) - ensemblG = words[2] if words[2] != "-" else None - ensemblT = words[4] if words[4] != "-" else None - ensemblP = words[6] if words[6] != "-" else None - - if ensemblG or ensemblT or ensemblP: - while entrezID and (entrezID in entrezUpdate): - entrezID = entrezUpdate[entrezID] - - if entrezID and (entrezID in entrezBID): - if ensemblG: - nsNames['ensembl_gid'].add( (entrezBID[entrezID],ensemblG) ) - if ensemblT: - nsNames['ensembl_gid'].add( (entrezBID[entrezID],ensemblT) ) - if ensemblP: - nsNames['ensembl_pid'].add( (entrezBID[entrezID],ensemblP) ) - #if taxonomy is 9606 (human) - #foreach line in ensFile - - # print stats - numNames0 = numNames - numNames = sum(len(nsNames[ns]) for ns in nsNames) - self.log("processing ensembl gene names completed: %d identifiers\n" % (numNames-numNames0)) - #if ensembl name header ok - - # process unigene gene names - self.log("processing unigene gene names ...\n") - with open(path+'/gene2unigene','r') as ugFile: - header = ugFile.__next__().rstrip() - if not ( - header.startswith("#Format: GeneID UniGene_cluster") # "(tab is used as a separator, pound sign - start of a 
comment)" - or header.startswith("#GeneID UniGene_cluster") - ): - self.log(" ERROR: unrecognized file header\n") - self.log("%s\n" % header) - else: - for line in ugFile: - words = line.rstrip().split("\t") - entrezID = int(words[0]) if words[0] != "-" else None - unigeneID = words[1] if words[1] != "-" else None - - while entrezID and (entrezID in entrezUpdate): - entrezID = entrezUpdate[entrezID] - - # there will be lots of extraneous mappings for genes of other species - if entrezID and (entrezID in entrezBID) and unigeneID: - nsNames['unigene_gid'].add( (entrezBID[entrezID],unigeneID) ) - #foreach line in ugFile - - # print stats - numNames0 = numNames - numNames = sum(len(nsNames[ns]) for ns in nsNames) - self.log("processing unigene gene names completed: %d identifiers\n" % (numNames-numNames0)) - #if unigene name header ok - #with ugFile - - if True: - # process uniprot gene names from entrez - self.log("processing uniprot gene names ...\n") - upFile = self.zfile(path+'/gene_refseq_uniprotkb_collab.gz') #TODO:context manager,iterator - header = upFile.__next__().rstrip() - if not ( - header.startswith("#Format: NCBI_protein_accession UniProtKB_protein_accession") # "(tab is used as a separator, pound sign - start of a comment)" - or header.startswith("#NCBI_protein_accession UniProtKB_protein_accession") - ): - self.log(" ERROR: unrecognized file header\n") - self.log("%s\n" % header) - else: - for line in upFile: - words = line.split("\t") - proteinAcc = words[0].rsplit('.',1)[0] if words[0] != "-" else None - uniprotAcc = words[1] if words[1] != "-" else None - - # there will be tons of identifiers missing from refseqBIDs because they're non-human - if proteinAcc and (proteinAcc in refseqBIDs) and uniprotAcc: - for biopolymerID in refseqBIDs[proteinAcc]: - nsNames['uniprot_pid'].add( (biopolymerID,uniprotAcc) ) - #foreach line in upFile - - # print stats - numNames0 = numNames - numNames = sum(len(nsNames[ns]) for ns in nsNames) - self.log("processing uniprot gene names completed: %d identifiers\n" % (numNames-numNames0)) - #if header ok - else: - # process uniprot gene names from uniprot (no header!) 
- self.log("processing uniprot gene names ...\n") - upFile = self.zfile(path+'/HUMAN_9606_idmapping_selected.tab.gz') #TODO:context manager,iterator - """ /* ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/README */ + + @classmethod + def getVersionString(cls): + return "2.4 (2022-04-12)" + + # getVersionString() + + @classmethod + def getOptions(cls): + return { + "locus-tags": "[yes|no] -- include a gene's 'Locus Tag' as an alias (default: no)", + "favor-primary": "[yes|no] -- reduce symbol ambiguity by favoring primary symbols (default: yes)", + "favor-hist": "[yes|no] -- reduce symbol ambiguity by favoring primary symbols (default: yes)", + } + + # getOptions() + + def validateOptions(self, options): + for o, v in options.items(): + v = v.strip().lower() + if o in ("locus-tags", "favor-primary", "favor-hist"): + if "yes".startswith(v): + v = "yes" + elif "no".startswith(v): + v = "no" + else: + return "%s must be 'yes' or 'no'" % o + else: + return "unknown option '%s'" % o + options[o] = v + return True + + # validateOptions() + + def download(self, options, path): + # download the latest source files + # self.downloadFilesFromFTP('ftp.ncbi.nih.gov', { + # 'Homo_sapiens.gene_info.gz': '/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz', + # 'gene2refseq.gz': '/gene/DATA/gene2refseq.gz', + # 'gene_history.gz': '/gene/DATA/gene_history.gz', + # 'gene2ensembl.gz': '/gene/DATA/gene2ensembl.gz', + # 'gene2unigene': '/gene/DATA/ARCHIVE/gene2unigene', + # 'gene_refseq_uniprotkb_collab.gz': '/gene/DATA/gene_refseq_uniprotkb_collab.gz', + # }) + # self.downloadFilesFromFTP('ftp.uniprot.org', { + # 'HUMAN_9606_idmapping_selected.tab.gz': '/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz', + # }) + + self.downloadFilesFromHTTP( + "ftp.ncbi.nih.gov", + { + path + + "/Homo_sapiens.gene_info.gz": "/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz", + path + "/gene2refseq.gz": "/gene/DATA/gene2refseq.gz", + path + "/gene_history.gz": "/gene/DATA/gene_history.gz", + path + "/gene2ensembl.gz": "/gene/DATA/gene2ensembl.gz", + path + "/gene2unigene": "/gene/DATA/ARCHIVE/gene2unigene", + path + + "/gene_refseq_uniprotkb_collab.gz": "/gene/DATA/gene_refseq_uniprotkb_collab.gz", + }, + ) + self.downloadFilesFromHTTP( + "ftp.ebi.ac.uk", + { + path + + "/HUMAN_9606_idmapping_selected.tab.gz": "/pub/databases/uniprot/current_release/knowledgebase/idmapping/by_organism/HUMAN_9606_idmapping_selected.tab.gz", + }, + ) + + return [ + path + "/Homo_sapiens.gene_info.gz", + path + "/gene2refseq.gz", + path + "/gene_history.gz", + path + "/gene2ensembl.gz", + path + "/gene2unigene", + path + "/gene_refseq_uniprotkb_collab.gz", + path + "/HUMAN_9606_idmapping_selected.tab.gz", + ] + + # download() + + def update(self, options, path): + # clear out all old data from this source + self.log("deleting old records from the database ...\n") + self.deleteAll() + self.log("deleting old records from the database completed\n") + + # get or create the required metadata records + ldprofileID = self.addLDProfiles( + [ + ("", "no LD adjustment", None, None), + ] + ) + namespaceID = self.addNamespaces( + [ + ("symbol", 0), + ("entrez_gid", 0), + ("refseq_gid", 0), + ("refseq_pid", 1), + ("ensembl_gid", 0), + ("ensembl_pid", 1), + ("hgnc_id", 0), + ("mim_id", 0), + ("hprd_id", 0), + ("vega_id", 0), + ("rgd_id", 0), + ("mirbase_id", 0), + ("unigene_gid", 0), + ("uniprot_gid", 0), + ("uniprot_pid", 1), + ] + ) + typeID = 
self.addTypes( + [ + ("gene",), + ] + ) + + nsNames = {ns: set() for ns in namespaceID} + nsNameNames = {ns: set() for ns in namespaceID} + numNames = numNameNames = numNameRefs = 0 + + # process genes (no header!) + self.log("processing genes ...\n") + entrezGene = dict() + entrezChm = dict() + primaryEntrez = dict() + xrefNS = { + "Ensembl_G": "ensembl_gid", + "Ensembl_T": "ensembl_gid", + "Ensembl_P": "ensembl_pid", + "HGNC": "hgnc_id", + "MIM": "mim_id", + "HPRD": "hprd_id", + "Vega": "vega_id", + "RGD": "rgd_id", + "miRBase": "mirbase_id", + } + geneFile = self.zfile( + path + "/Homo_sapiens.gene_info.gz" + ) # TODO:context manager,iterator + for line in geneFile: + # quickly filter out all non-9606 (human) taxonomies before taking the time to split() + if line.startswith("9606\t"): + words = line.rstrip().split("\t") + entrezID = int(words[1]) + symbol = words[2] + aliases = words[4].split("|") if words[4] != "-" else list() + if options.get("locus-tags", "no") == "yes" and words[3] != "-": + aliases.append(words[3]) + xrefs = words[5].split("|") if words[5] != "-" else list() + chm = words[6] + desc = words[8] + + entrezGene[entrezID] = (symbol, desc) + entrezChm[entrezID] = chm + if symbol not in primaryEntrez: + primaryEntrez[symbol] = entrezID + elif primaryEntrez[symbol] != entrezID: + primaryEntrez[symbol] = False + + # entrezID as a name for itself looks funny here, but later on + # we'll be translating the target entrezID to biopolymer_id and + # adding more historical entrezID aliases + nsNames["entrez_gid"].add((entrezID, entrezID)) + nsNames["symbol"].add((entrezID, symbol)) + for alias in aliases: + nsNames["symbol"].add((entrezID, alias)) + for xref in xrefs: + xrefDB, xrefID = xref.split(":", 1) + # turn ENSG/ENSP/ENST into Ensembl_X + if ( + xrefDB == "Ensembl" + and xrefID.startswith("ENS") + and len(xrefID) > 3 + ): + xrefDB = "Ensembl_%c" % xrefID[3] + if xrefDB in xrefNS: + nsNames[xrefNS[xrefDB]].add((entrezID, xrefID)) + # if taxonomy is 9606 (human) + # foreach line in geneFile + + # delete any symbol alias which is also the primary name of exactly one other gene + if options.get("favor-primary", "yes") == "yes": + dupe = set() + for alias in nsNames["symbol"]: + entrezID = alias[0] + symbol = alias[1] + if ( + (symbol in primaryEntrez) + and (primaryEntrez[symbol] != False) + and (primaryEntrez[symbol] != entrezID) + ): + dupe.add(alias) + nsNames["symbol"] -= dupe + dupe = None + # if favor-primary + + # print stats + numGenes = len(entrezGene) + numNames0 = numNames + numNames = sum(len(nsNames[ns]) for ns in nsNames) + self.log( + "processing genes completed: %d genes, %d identifiers\n" + % (numGenes, numNames - numNames0) + ) + + # store genes + self.log("writing genes to the database ...\n") + listEntrez = entrezGene.keys() + listBID = self.addTypedBiopolymers( + typeID["gene"], (entrezGene[entrezID] for entrezID in listEntrez) + ) + entrezBID = dict(zip(listEntrez, listBID)) + numGenes = len(entrezBID) + self.log("writing genes to the database completed: %d genes\n" % (numGenes)) + entrezGene = None + + # translate target entrezID to biopolymer_id in nsNames + for ns in nsNames: + names = set( + (entrezBID[name[0]], name[1]) + for name in nsNames[ns] + if name[0] in entrezBID + ) + nsNames[ns] = names + numNames = sum(len(nsNames[ns]) for ns in nsNames) + + # process gene regions + # Entrez sequences use 0-based closed intervals, according to: + # http://www.ncbi.nlm.nih.gov/books/NBK3840/#genefaq.Representation_of_nucleotide_pos + # and comparison of 
web-reported boundary coordinates to gene length (len = end - start + 1). + # Since LOKI uses 1-based closed intervals, we add 1 to all coordinates. + self.log("processing gene regions ...\n") + reBuild = re.compile("GRCh([0-9]+)") + grcBuild = None + buildGenes = collections.defaultdict(set) + buildRegions = collections.defaultdict(set) + setOrphan = set() + setBadNC = set() + setBadBuild = set() + setBadChr = set() + refseqBIDs = collections.defaultdict(set) + regionFile = self.zfile( + path + "/gene2refseq.gz" + ) # TODO:context manager,iterator + header = regionFile.__next__().rstrip() + if not ( + header.startswith( + "#Format: tax_id GeneID status RNA_nucleotide_accession.version RNA_nucleotide_gi protein_accession.version protein_gi genomic_nucleotide_accession.version genomic_nucleotide_gi start_position_on_the_genomic_accession end_position_on_the_genomic_accession orientation assembly" + ) # "(tab is used as a separator, pound sign - start of a comment)" + or header.startswith( + "#tax_id GeneID status RNA_nucleotide_accession.version RNA_nucleotide_gi protein_accession.version protein_gi genomic_nucleotide_accession.version genomic_nucleotide_gi start_position_on_the_genomic_accession end_position_on_the_genomic_accession orientation assembly" + ) # " mature_peptide_accession.version mature_peptide_gi Symbol" + ): + self.log(" ERROR: unrecognized file header\n") + self.log("%s\n" % header) + else: + for line in regionFile: + # skip non-9606 (human) taxonomies before taking the time to split() + if not line.startswith("9606\t"): + continue + + # grab relevant columns + words = line.split("\t") + entrezID = int(words[1]) + rnaAcc = words[3].rsplit(".", 1)[0] if words[3] != "-" else None + proAcc = words[5].rsplit(".", 1)[0] if words[5] != "-" else None + genAcc = words[7].rsplit(".", 1)[0] if words[7] != "-" else None + posMin = (int(words[9]) + 1) if words[9] != "-" else None + posMax = (int(words[10]) + 1) if words[10] != "-" else None + build = reBuild.search( + words[12].rstrip() if (len(words) > 12 and words[12] != "-") else "" + ) + + # skip unrecognized IDs + if entrezID not in entrezBID: + setOrphan.add(entrezID) + continue + + # store rna and protein sequence RefSeq IDs + # (don't store genAcc, there's only one per chromosome) + if rnaAcc: + nsNames["refseq_gid"].add((entrezBID[entrezID], rnaAcc)) + if proAcc: + nsNames["refseq_pid"].add((entrezBID[entrezID], proAcc)) + refseqBIDs[proAcc].add(entrezBID[entrezID]) + + # skip non-whole-chromosome regions + # (refseq accession types: http://www.ncbi.nlm.nih.gov/RefSeq/key.html) + if not (genAcc and genAcc.startswith("NC_")): + setBadNC.add(entrezID) + continue + elif not build: + setBadBuild.add(entrezID) + continue + + # skip chromosome mismatches + if genAcc in ( + "NC_001807", + "NC_012920", + ): # TODO: avoid hardcoding this mapping + chm = self._loki.chr_num.get("MT") + else: + chm = self._loki.chr_num.get(genAcc[3:].lstrip("0")) + if not chm: + setBadChr.add(entrezID) + continue + elif (entrezID in entrezChm) and ( + self._loki.chr_name[chm] not in entrezChm[entrezID].split("|") + ): + # TODO: make sure we want to ignore any gene region with an ambiguous chromosome + # (i.e. 
gene_info says one thing, gene2refseq says another) + # print "%s %s -> %s" % (entrezID,entrezChm[entrezID],self._loki.chr_name[chm]) + # 100293744 X -> Y + # 100302657 3 -> 15 + # 100418703 Y -> X + # 100507426 Y -> X + setBadChr.add(entrezID) + continue + + # store the region by build version number, so we can pick the majority build later + buildGenes[build.group(1)].add(entrezID) + buildRegions[build.group(1)].add( + (entrezBID[entrezID], chm, posMin, posMax) + ) + # foreach line in regionFile + + # identify majority build version + grcBuild = max(buildRegions, key=lambda build: len(buildRegions[build])) + setBadVers = set() + for build, genes in buildGenes.items(): + if build != grcBuild: + setBadVers.update(genes) + + # print stats + setBadVers.difference_update(buildGenes[grcBuild]) + setBadChr.difference_update(buildGenes[grcBuild], setBadVers) + setBadBuild.difference_update(buildGenes[grcBuild], setBadVers, setBadChr) + setBadNC.difference_update( + buildGenes[grcBuild], setBadVers, setBadChr, setBadNC + ) + numRegions = len(buildRegions[grcBuild]) + numGenes = len(buildGenes[grcBuild]) + numNames0 = numNames + numNames = sum(len(nsNames[ns]) for ns in nsNames) + self.log( + "processing gene regions completed: %d regions (%d genes), %d identifiers\n" + % (numRegions, numGenes, numNames - numNames0) + ) + self.logPush() + if setOrphan: + self.log( + "WARNING: %d regions for undefnied EntrezIDs\n" % (len(setOrphan)) + ) + if setBadNC: + self.log( + "WARNING: %d genes not mapped to whole chromosome\n" + % (len(setBadNC)) + ) + if setBadBuild: + self.log( + "WARNING: %d genes not mapped to any GRCh build\n" + % (len(setBadBuild)) + ) + if setBadVers: + self.log( + "WARNING: %d genes mapped to GRCh build version other than %s\n" + % (len(setBadVers), grcBuild) + ) + if setBadChr: + self.log( + "WARNING: %d genes on mismatching chromosome\n" % (len(setBadChr)) + ) + self.logPop() + entrezChm = setOrphan = setBadNC = setBadBuild = setBadChr = setBadVers = ( + buildGenes + ) = None + + # store gene regions + self.log("writing gene regions to the database ...\n") + numRegions = len(buildRegions[grcBuild]) + self.addBiopolymerLDProfileRegions(ldprofileID[""], buildRegions[grcBuild]) + self.log( + "writing gene regions to the database completed: %d regions\n" + % (numRegions) + ) + buildRegions = None + # if gene regions header ok + + # process historical gene names + self.log("processing historical gene names ...\n") + entrezUpdate = {} + historyEntrez = {} + histFile = self.zfile( + path + "/gene_history.gz" + ) # TODO:context manager,iterator + header = histFile.__next__().rstrip() + if not ( + header.startswith( + "#Format: tax_id GeneID Discontinued_GeneID Discontinued_Symbol" + ) # "Discontinue_Date (tab is used as a separator, pound sign - start of a comment)" + or header.startswith( + "#tax_id GeneID Discontinued_GeneID Discontinued_Symbol" + ) # "Discontinue_Date" + ): + self.log(" ERROR: unrecognized file header\n") + self.log("%s\n" % header) + else: + for line in histFile: + # quickly filter out all non-9606 (human) taxonomies before taking the time to split() + if line.startswith("9606\t"): + words = line.split("\t") + entrezID = int(words[1]) if words[1] != "-" else None + oldEntrez = int(words[2]) if words[2] != "-" else None + oldName = words[3] if words[3] != "-" else None + + if entrezID and entrezID in entrezBID: + if oldEntrez and oldEntrez != entrezID: + entrezUpdate[oldEntrez] = entrezID + nsNames["entrez_gid"].add((entrezBID[entrezID], oldEntrez)) + if oldName and ( + 
oldName not in primaryEntrez + or primaryEntrez[oldName] == False + ): + if oldName not in historyEntrez: + historyEntrez[oldName] = entrezID + elif historyEntrez[oldName] != entrezID: + historyEntrez[oldName] = False + nsNames["symbol"].add((entrezBID[entrezID], oldName)) + # if taxonomy is 9606 (human) + # foreach line in histFile + + # delete any symbol alias which is also the historical name of exactly one other gene + if options.get("favor-hist", "yes") == "yes": + dupe = set() + for alias in nsNames["symbol"]: + entrezID = alias[0] + symbol = alias[1] + if ( + (symbol in historyEntrez) + and (historyEntrez[symbol] != False) + and (historyEntrez[symbol] != entrezID) + ): + dupe.add(alias) + nsNames["symbol"] -= dupe + dupe = None + # if favor-hist + + # print stats + numNames0 = numNames + numNames = sum(len(nsNames[ns]) for ns in nsNames) + self.log( + "processing historical gene names completed: %d identifiers\n" + % (numNames - numNames0) + ) + # if historical name header ok + + # process ensembl gene names + self.log("processing ensembl gene names ...\n") + ensFile = self.zfile(path + "/gene2ensembl.gz") # TODO:context manager,iterator + header = ensFile.__next__().rstrip() + if not ( + header.startswith( + "#Format: tax_id GeneID Ensembl_gene_identifier RNA_nucleotide_accession.version Ensembl_rna_identifier protein_accession.version Ensembl_protein_identifier" + ) # "(tab is used as a separator, pound sign - start of a comment)" + or header.startswith( + "#tax_id GeneID Ensembl_gene_identifier RNA_nucleotide_accession.version Ensembl_rna_identifier protein_accession.version Ensembl_protein_identifier" + ) + ): + self.log(" ERROR: unrecognized file header\n") + self.log("%s\n" % header) + else: + for line in ensFile: + # quickly filter out all non-9606 (human) taxonomies before taking the time to split() + if line.startswith("9606\t"): + words = line.split("\t") + entrezID = int(words[1]) + ensemblG = words[2] if words[2] != "-" else None + ensemblT = words[4] if words[4] != "-" else None + ensemblP = words[6] if words[6] != "-" else None + + if ensemblG or ensemblT or ensemblP: + while entrezID and (entrezID in entrezUpdate): + entrezID = entrezUpdate[entrezID] + + if entrezID and (entrezID in entrezBID): + if ensemblG: + nsNames["ensembl_gid"].add( + (entrezBID[entrezID], ensemblG) + ) + if ensemblT: + nsNames["ensembl_gid"].add( + (entrezBID[entrezID], ensemblT) + ) + if ensemblP: + nsNames["ensembl_pid"].add( + (entrezBID[entrezID], ensemblP) + ) + # if taxonomy is 9606 (human) + # foreach line in ensFile + + # print stats + numNames0 = numNames + numNames = sum(len(nsNames[ns]) for ns in nsNames) + self.log( + "processing ensembl gene names completed: %d identifiers\n" + % (numNames - numNames0) + ) + # if ensembl name header ok + + # process unigene gene names + self.log("processing unigene gene names ...\n") + with open(path + "/gene2unigene", "r") as ugFile: + header = ugFile.__next__().rstrip() + if not ( + header.startswith( + "#Format: GeneID UniGene_cluster" + ) # "(tab is used as a separator, pound sign - start of a comment)" + or header.startswith("#GeneID UniGene_cluster") + ): + self.log(" ERROR: unrecognized file header\n") + self.log("%s\n" % header) + else: + for line in ugFile: + words = line.rstrip().split("\t") + entrezID = int(words[0]) if words[0] != "-" else None + unigeneID = words[1] if words[1] != "-" else None + + while entrezID and (entrezID in entrezUpdate): + entrezID = entrezUpdate[entrezID] + + # there will be lots of extraneous mappings for 
genes of other species + if entrezID and (entrezID in entrezBID) and unigeneID: + nsNames["unigene_gid"].add((entrezBID[entrezID], unigeneID)) + # foreach line in ugFile + + # print stats + numNames0 = numNames + numNames = sum(len(nsNames[ns]) for ns in nsNames) + self.log( + "processing unigene gene names completed: %d identifiers\n" + % (numNames - numNames0) + ) + # if unigene name header ok + # with ugFile + + if True: + # process uniprot gene names from entrez + self.log("processing uniprot gene names ...\n") + upFile = self.zfile( + path + "/gene_refseq_uniprotkb_collab.gz" + ) # TODO:context manager,iterator + header = upFile.__next__().rstrip() + if not ( + header.startswith( + "#Format: NCBI_protein_accession UniProtKB_protein_accession" + ) # "(tab is used as a separator, pound sign - start of a comment)" + or header.startswith( + "#NCBI_protein_accession UniProtKB_protein_accession" + ) + ): + self.log(" ERROR: unrecognized file header\n") + self.log("%s\n" % header) + else: + for line in upFile: + words = line.split("\t") + proteinAcc = words[0].rsplit(".", 1)[0] if words[0] != "-" else None + uniprotAcc = words[1] if words[1] != "-" else None + + # there will be tons of identifiers missing from refseqBIDs because they're non-human + if proteinAcc and (proteinAcc in refseqBIDs) and uniprotAcc: + for biopolymerID in refseqBIDs[proteinAcc]: + nsNames["uniprot_pid"].add((biopolymerID, uniprotAcc)) + # foreach line in upFile + + # print stats + numNames0 = numNames + numNames = sum(len(nsNames[ns]) for ns in nsNames) + self.log( + "processing uniprot gene names completed: %d identifiers\n" + % (numNames - numNames0) + ) + # if header ok + else: + # process uniprot gene names from uniprot (no header!) + self.log("processing uniprot gene names ...\n") + upFile = self.zfile( + path + "/HUMAN_9606_idmapping_selected.tab.gz" + ) # TODO:context manager,iterator + """ /* ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/README */ 1. UniProtKB-AC 2. UniProtKB-ID 3. GeneID (EntrezGene) @@ -505,93 +621,143 @@ def update(self, options, path): 22. Ensembl_PRO 23. 
Additional PubMed """ - for line in upFile: - words = line.split("\t") - uniprotAcc = words[0] - uniprotID = words[1] - found = False - for word2 in words[2].split(';'): - entrezID = int(word2.strip()) if word2 else None - if entrezID and (entrezID in entrezBID): - nsNameNames['uniprot_pid'].add( (namespaceID['entrez_gid'],entrezID,uniprotAcc) ) - nsNameNames['uniprot_gid'].add( (namespaceID['entrez_gid'],entrezID,uniprotID) ) - found = True - #foreach entrezID mapping - if not found: - for word3 in words[3].split(';'): - refseqID = word3.strip().split('.',1)[0] if word3 else None - if refseqID: - nsNameNames['uniprot_pid'].add( (namespaceID['refseq_pid'],refseqID,uniprotAcc) ) - nsNameNames['uniprot_pid'].add( (namespaceID['refseq_gid'],refseqID,uniprotAcc) ) - nsNameNames['uniprot_gid'].add( (namespaceID['refseq_pid'],refseqID,uniprotID) ) - nsNameNames['uniprot_gid'].add( (namespaceID['refseq_gid'],refseqID,uniprotID) ) - #foreach refseq mapping - for word14 in words[14].split(';'): - mimID = word14.strip() if word14 else None - if mimID: - nsNameNames['uniprot_pid'].add( (namespaceID['mim_id'],mimID,uniprotAcc) ) - nsNameNames['uniprot_gid'].add( (namespaceID['mim_id'],mimID,uniprotID) ) - #foreach mim mapping - for word15 in words[15].split(';'): - unigeneID = word15.strip() if word15 else None - if unigeneID: - nsNameNames['uniprot_pid'].add( (namespaceID['unigene_gid'],unigeneID,uniprotAcc) ) - nsNameNames['uniprot_gid'].add( (namespaceID['unigene_gid'],unigeneID,uniprotID) ) - #foreach mim mapping - for word19 in words[19].split(';'): - ensemblGID = word19.strip() if word19 else None - if ensemblGID: - nsNameNames['uniprot_pid'].add( (namespaceID['ensembl_gid'],ensemblGID,uniprotAcc) ) - nsNameNames['uniprot_gid'].add( (namespaceID['ensembl_gid'],ensemblGID,uniprotID) ) - #foreach ensG mapping - for word20 in words[20].split(';'): - ensemblTID = word20.strip() if word20 else None - if ensemblTID: - nsNameNames['uniprot_pid'].add( (namespaceID['ensembl_gid'],ensemblTID,uniprotAcc) ) - nsNameNames['uniprot_gid'].add( (namespaceID['ensembl_gid'],ensemblTID,uniprotID) ) - #foreach ensT mapping - for word21 in words[21].split(';'): - ensemblPID = word21.strip() if word21 else None - if ensemblPID: - nsNameNames['uniprot_pid'].add( (namespaceID['ensembl_pid'],ensemblPID,uniprotAcc) ) - nsNameNames['uniprot_gid'].add( (namespaceID['ensembl_pid'],ensemblPID,uniprotID) ) - #foreach ensP mapping - #if no entrezID match - #foreach line in upFile - - # print stats - numNames0 = numNames - numNames = sum(len(nsNames[ns]) for ns in nsNames) - numNameNames0 = numNameNames - numNameNames = sum(len(set(n[2] for n in nsNameNames[ns])) for ns in nsNameNames) - numNameRefs0 = numNameRefs - numNameRefs = sum(len(nsNameNames[ns]) for ns in nsNameNames) - self.log("processing uniprot gene names completed: %d identifiers (%d references)\n" % (numNames-numNames0+numNameNames-numNameNames0,numNameRefs-numNameRefs0)) - #switch uniprot source - - # store gene names - self.log("writing gene identifiers to the database ...\n") - numNames = 0 - for ns in nsNames: - if nsNames[ns]: - numNames += len(nsNames[ns]) - self.addBiopolymerNamespacedNames(namespaceID[ns], nsNames[ns]) - self.log("writing gene identifiers to the database completed: %d identifiers\n" % (numNames,)) - nsNames = None - - # store gene names - numNameNames = sum(len(nsNameNames[ns]) for ns in nsNameNames) - if numNameNames: - self.log("writing gene identifier references to the database ...\n") - for ns in nsNameNames: - if nsNameNames[ns]: - 
self.addBiopolymerTypedNameNamespacedNames(typeID['gene'], namespaceID[ns], nsNameNames[ns]) - self.log("writing gene identifier references to the database completed: %d references\n" % (numNameNames,)) - nsNameNames = None - #if numNameNames - - # store source metadata - self.setSourceBuilds(grcBuild, None) - #update() - -#Source_entrez + for line in upFile: + words = line.split("\t") + uniprotAcc = words[0] + uniprotID = words[1] + found = False + for word2 in words[2].split(";"): + entrezID = int(word2.strip()) if word2 else None + if entrezID and (entrezID in entrezBID): + nsNameNames["uniprot_pid"].add( + (namespaceID["entrez_gid"], entrezID, uniprotAcc) + ) + nsNameNames["uniprot_gid"].add( + (namespaceID["entrez_gid"], entrezID, uniprotID) + ) + found = True + # foreach entrezID mapping + if not found: + for word3 in words[3].split(";"): + refseqID = word3.strip().split(".", 1)[0] if word3 else None + if refseqID: + nsNameNames["uniprot_pid"].add( + (namespaceID["refseq_pid"], refseqID, uniprotAcc) + ) + nsNameNames["uniprot_pid"].add( + (namespaceID["refseq_gid"], refseqID, uniprotAcc) + ) + nsNameNames["uniprot_gid"].add( + (namespaceID["refseq_pid"], refseqID, uniprotID) + ) + nsNameNames["uniprot_gid"].add( + (namespaceID["refseq_gid"], refseqID, uniprotID) + ) + # foreach refseq mapping + for word14 in words[14].split(";"): + mimID = word14.strip() if word14 else None + if mimID: + nsNameNames["uniprot_pid"].add( + (namespaceID["mim_id"], mimID, uniprotAcc) + ) + nsNameNames["uniprot_gid"].add( + (namespaceID["mim_id"], mimID, uniprotID) + ) + # foreach mim mapping + for word15 in words[15].split(";"): + unigeneID = word15.strip() if word15 else None + if unigeneID: + nsNameNames["uniprot_pid"].add( + (namespaceID["unigene_gid"], unigeneID, uniprotAcc) + ) + nsNameNames["uniprot_gid"].add( + (namespaceID["unigene_gid"], unigeneID, uniprotID) + ) + # foreach mim mapping + for word19 in words[19].split(";"): + ensemblGID = word19.strip() if word19 else None + if ensemblGID: + nsNameNames["uniprot_pid"].add( + (namespaceID["ensembl_gid"], ensemblGID, uniprotAcc) + ) + nsNameNames["uniprot_gid"].add( + (namespaceID["ensembl_gid"], ensemblGID, uniprotID) + ) + # foreach ensG mapping + for word20 in words[20].split(";"): + ensemblTID = word20.strip() if word20 else None + if ensemblTID: + nsNameNames["uniprot_pid"].add( + (namespaceID["ensembl_gid"], ensemblTID, uniprotAcc) + ) + nsNameNames["uniprot_gid"].add( + (namespaceID["ensembl_gid"], ensemblTID, uniprotID) + ) + # foreach ensT mapping + for word21 in words[21].split(";"): + ensemblPID = word21.strip() if word21 else None + if ensemblPID: + nsNameNames["uniprot_pid"].add( + (namespaceID["ensembl_pid"], ensemblPID, uniprotAcc) + ) + nsNameNames["uniprot_gid"].add( + (namespaceID["ensembl_pid"], ensemblPID, uniprotID) + ) + # foreach ensP mapping + # if no entrezID match + # foreach line in upFile + + # print stats + numNames0 = numNames + numNames = sum(len(nsNames[ns]) for ns in nsNames) + numNameNames0 = numNameNames + numNameNames = sum( + len(set(n[2] for n in nsNameNames[ns])) for ns in nsNameNames + ) + numNameRefs0 = numNameRefs + numNameRefs = sum(len(nsNameNames[ns]) for ns in nsNameNames) + self.log( + "processing uniprot gene names completed: %d identifiers (%d references)\n" + % ( + numNames - numNames0 + numNameNames - numNameNames0, + numNameRefs - numNameRefs0, + ) + ) + # switch uniprot source + + # store gene names + self.log("writing gene identifiers to the database ...\n") + numNames = 0 + for ns in nsNames: 
+ if nsNames[ns]: + numNames += len(nsNames[ns]) + self.addBiopolymerNamespacedNames(namespaceID[ns], nsNames[ns]) + self.log( + "writing gene identifiers to the database completed: %d identifiers\n" + % (numNames,) + ) + nsNames = None + + # store gene names + numNameNames = sum(len(nsNameNames[ns]) for ns in nsNameNames) + if numNameNames: + self.log("writing gene identifier references to the database ...\n") + for ns in nsNameNames: + if nsNameNames[ns]: + self.addBiopolymerTypedNameNamespacedNames( + typeID["gene"], namespaceID[ns], nsNameNames[ns] + ) + self.log( + "writing gene identifier references to the database completed: %d references\n" + % (numNameNames,) + ) + nsNameNames = None + # if numNameNames + + # store source metadata + self.setSourceBuilds(grcBuild, None) + + # update() + + +# Source_entrez diff --git a/loki/loaders/loki_source_go.py b/loki/loaders/loki_source_go.py index e1678b5..392c3cf 100644 --- a/loki/loaders/loki_source_go.py +++ b/loki/loaders/loki_source_go.py @@ -6,216 +6,261 @@ class Source_go(loki_source.Source): - - - @classmethod - def getVersionString(cls): - return '2.1 (2022-04-14)' - #getVersionString() - - - def download(self, options, path): - # download the latest source files - self.downloadFilesFromHTTP('current.geneontology.org', { - path+'/goa_human.gaf.gz': '/annotations/goa_human.gaf.gz', - path+'/go.obo': '/ontology/go.obo', - }) - - return [ - path+'/goa_human.gaf.gz', - path+'/go.obo' - ] - #download() - - - def update(self, options, path): - # clear out all old data from this source - self.log("deleting old records from the database ...\n") - self.deleteAll() - self.log("deleting old records from the database completed\n") - - # get or create the required metadata records - namespaceID = self.addNamespaces([ - ('go_id', 0), - ('ontology', 0), - ('symbol', 0), - ('uniprot_pid', 1), - ]) - relationshipID = self.addRelationships([ - ('is_a',), - ]) - typeID = self.addTypes([ - ('ontology',), - ('gene',), - ]) - subtypeID = self.addSubtypes([ - ('-',), - ]) - - # process ontology terms - self.log("processing ontology terms ...\n") - # file format specification: http://www.geneontology.org/GO.format.obo-1_2.shtml - # correctly handling all the possible escape sequences and special cases - # in the OBO spec would be somewhat involved, but the previous version - # of biofilter used a much simpler approach which seemed to work okay in - # practice, so we'll stick with that for now - reTrailingEscape = re.compile('(?:^|[^\\\\])(?:\\\\\\\\)*\\\\$') - empty = tuple() - goName = {} - goDef = {} - goLinks = {} - #goNS = {} - #oboProps = {} - curStanza = curID = curAnon = curObs = curName = curNS = curDef = curLinks = None - with open(path+'/go.obo','r') as oboFile: - while True: - try: - line = next(oboFile).rstrip() - parts = line.split('!',1)[0].split(':',1) - tag = parts[0].strip() - val = parts[1].strip() if (len(parts) > 1) else None - except StopIteration: - line = False - - if line == False or tag.startswith('['): - if (curStanza == 'Term') and curID and (not curAnon) and (not curObs): - goName[curID] = curName - goDef[curID] = curDef - goLinks[curID] = curLinks or empty - # goNS[curID] = curNS or (oboProps['default-namespace'][-1] if ('default-namespace' in oboProps) else None) - if line == False: - break - curStanza = tag[1:tag.index(']')] - curID = curAnon = curObs = curName = curNS = curDef = curLinks = None - #elif not curStanza: - # # before the first stanza, tag-value pairs are global file properties - # if tag not in oboProps: - # 
oboProps[tag] = [] - # oboProps[tag].append(val) - elif tag == 'id': - curID = val - elif tag == 'alt_id': - pass - elif tag == 'def': - curDef = val - if val.startswith('"'): - curDef = '' - words = val.split('"') - for w in range(1,len(words)): - curDef += words[w] - if not reTrailingEscape.search(words[w]): - break - elif tag == 'is_anonymous': - curAnon = (val.lower().split()[0] == 'true') - elif tag == 'is_obsolete': - curObs = (val.lower().split()[0] == 'true') - elif tag == 'replaced_by': - pass - #elif tag == 'namespace': - # curNS = val - elif tag == 'name': - curName = val - elif tag == 'synonym': - pass - elif tag == 'xref': - pass - elif tag == 'is_a': - curLinks = curLinks or set() - curLinks.add( (val.split()[0], relationshipID['is_a'], -1) ) - elif tag == 'relationship': - curLinks = curLinks or set() - words = val.split() - if words[0] not in relationshipID: - relationshipID[words[0]] = self.addRelationship(words[0]) - if words[0] == 'part_of': - contains = -1 - elif words[0] in ('regulates','positively_regulates','negatively_regulates'): - contains = 0 - else: - contains = None - curLinks.add( (words[1], relationshipID[words[0]], contains) ) - #foreach line - #with oboFile - numTerms = len(goName) - numLinks = sum(len(goLinks[goID]) for goID in goLinks) - self.log("processing ontology terms completed: %d terms, %d links\n" % (numTerms,numLinks)) - - # store ontology terms - self.log("writing ontology terms to the database ...\n") - listGoID = goName.keys() - listGID = self.addTypedGroups(typeID['ontology'], ((subtypeID['-'], goName[goID],goDef[goID]) for goID in listGoID)) - goGID = dict(zip(listGoID,listGID)) - self.log("writing ontology terms to the database completed\n") - - # store ontology term names - self.log("writing ontology term names to the database ...\n") - self.addGroupNamespacedNames(namespaceID['go_id'], ((goGID[goID],goID) for goID in listGoID)) - self.addGroupNamespacedNames(namespaceID['ontology'], ((goGID[goID],goName[goID]) for goID in listGoID)) - self.log("writing ontology term names to the database completed\n") - - # store ontology term links - self.log("writing ontology term relationships to the database ...\n") - listLinks = [] - for goID in goLinks: - for link in (goLinks[goID] or empty): - if link[0] in goGID: - listLinks.append( (goGID[goID],goGID[link[0]],link[1],link[2]) ) - self.addGroupRelationships(listLinks) - self.log("writing ontology term relationships to the database completed\n") - - # process gene associations - self.log("processing gene associations ...\n") - if os.path.isfile(path+'/gene_association.goa_human.gz') and not os.path.isfile(path+'/goa_human.gaf.gz'): - assocFile = self.zfile(path+'/gene_association.goa_human.gz') #TODO:context manager,iterator - else: - assocFile = self.zfile(path+'/goa_human.gaf.gz') #TODO:context manager,iterator - nsAssoc = { - 'uniprot_pid': set(), - 'symbol': set() - } - numAssoc = numID = 0 - for line in assocFile: - words = line.split('\t') - if len(words) < 13: - continue - xrefDB = words[0] - xrefID = words[1] - gene = words[2] - #assocType = words[3] - goID = words[4] - #reference = words[5] - evidence = words[6] - #withID = words[7] - #goType = words[8] - #desc = words[9] - aliases = words[10].split('|') - #xrefType = words[11] - taxon = words[12] - #updated = words[13] - #assigner = words[14] - #extensions = words[15].split('|') - #xrefIDsplice = words[16] - - # TODO: find out for sure why the old Biofilter loader ignores IEA - if xrefDB == 'UniProtKB' and goID in goGID and evidence != 
'IEA' and taxon == 'taxon:9606': - numAssoc += 1 - numID += 2 - nsAssoc['uniprot_pid'].add( (goGID[goID],numAssoc,xrefID) ) - nsAssoc['symbol'].add( (goGID[goID],numAssoc,gene) ) - for alias in aliases: - numID += 1 - # aliases might be either symbols or uniprot identifiers, so try them both ways - nsAssoc['uniprot_pid'].add( (goGID[goID],numAssoc,alias) ) - nsAssoc['symbol'].add( (goGID[goID],numAssoc,alias) ) - #if association is ok - #foreach association - self.log("processing gene associations completed: %d associations (%d identifiers)\n" % (numAssoc,numID)) - - # store gene associations - self.log("writing gene associations to the database ...\n") - for ns in nsAssoc: - self.addGroupMemberTypedNamespacedNames(typeID['gene'], namespaceID[ns], nsAssoc[ns]) - self.log("writing gene associations to the database completed\n") - #update() - -#Source_go + + @classmethod + def getVersionString(cls): + return "2.1 (2022-04-14)" + + # getVersionString() + + def download(self, options, path): + # download the latest source files + self.downloadFilesFromHTTP( + "current.geneontology.org", + { + path + "/goa_human.gaf.gz": "/annotations/goa_human.gaf.gz", + path + "/go.obo": "/ontology/go.obo", + }, + ) + + return [path + "/goa_human.gaf.gz", path + "/go.obo"] + + # download() + + def update(self, options, path): + # clear out all old data from this source + self.log("deleting old records from the database ...\n") + self.deleteAll() + self.log("deleting old records from the database completed\n") + + # get or create the required metadata records + namespaceID = self.addNamespaces( + [ + ("go_id", 0), + ("ontology", 0), + ("symbol", 0), + ("uniprot_pid", 1), + ] + ) + relationshipID = self.addRelationships( + [ + ("is_a",), + ] + ) + typeID = self.addTypes( + [ + ("ontology",), + ("gene",), + ] + ) + subtypeID = self.addSubtypes( + [ + ("-",), + ] + ) + + # process ontology terms + self.log("processing ontology terms ...\n") + # file format specification: http://www.geneontology.org/GO.format.obo-1_2.shtml + # correctly handling all the possible escape sequences and special cases + # in the OBO spec would be somewhat involved, but the previous version + # of biofilter used a much simpler approach which seemed to work okay in + # practice, so we'll stick with that for now + reTrailingEscape = re.compile("(?:^|[^\\\\])(?:\\\\\\\\)*\\\\$") + empty = tuple() + goName = {} + goDef = {} + goLinks = {} + # goNS = {} + # oboProps = {} + curStanza = curID = curAnon = curObs = curName = curNS = curDef = curLinks = ( + None + ) + with open(path + "/go.obo", "r") as oboFile: + while True: + try: + line = next(oboFile).rstrip() + parts = line.split("!", 1)[0].split(":", 1) + tag = parts[0].strip() + val = parts[1].strip() if (len(parts) > 1) else None + except StopIteration: + line = False + + if line == False or tag.startswith("["): + if ( + (curStanza == "Term") + and curID + and (not curAnon) + and (not curObs) + ): + goName[curID] = curName + goDef[curID] = curDef + goLinks[curID] = curLinks or empty + # goNS[curID] = curNS or (oboProps['default-namespace'][-1] if ('default-namespace' in oboProps) else None) + if line == False: + break + curStanza = tag[1 : tag.index("]")] + curID = curAnon = curObs = curName = curNS = curDef = curLinks = ( + None + ) + # elif not curStanza: + # # before the first stanza, tag-value pairs are global file properties + # if tag not in oboProps: + # oboProps[tag] = [] + # oboProps[tag].append(val) + elif tag == "id": + curID = val + elif tag == "alt_id": + pass + elif tag == 
"def": + curDef = val + if val.startswith('"'): + curDef = "" + words = val.split('"') + for w in range(1, len(words)): + curDef += words[w] + if not reTrailingEscape.search(words[w]): + break + elif tag == "is_anonymous": + curAnon = val.lower().split()[0] == "true" + elif tag == "is_obsolete": + curObs = val.lower().split()[0] == "true" + elif tag == "replaced_by": + pass + # elif tag == 'namespace': + # curNS = val + elif tag == "name": + curName = val + elif tag == "synonym": + pass + elif tag == "xref": + pass + elif tag == "is_a": + curLinks = curLinks or set() + curLinks.add((val.split()[0], relationshipID["is_a"], -1)) + elif tag == "relationship": + curLinks = curLinks or set() + words = val.split() + if words[0] not in relationshipID: + relationshipID[words[0]] = self.addRelationship(words[0]) + if words[0] == "part_of": + contains = -1 + elif words[0] in ( + "regulates", + "positively_regulates", + "negatively_regulates", + ): + contains = 0 + else: + contains = None + curLinks.add((words[1], relationshipID[words[0]], contains)) + # foreach line + # with oboFile + numTerms = len(goName) + numLinks = sum(len(goLinks[goID]) for goID in goLinks) + self.log( + "processing ontology terms completed: %d terms, %d links\n" + % (numTerms, numLinks) + ) + + # store ontology terms + self.log("writing ontology terms to the database ...\n") + listGoID = goName.keys() + listGID = self.addTypedGroups( + typeID["ontology"], + ((subtypeID["-"], goName[goID], goDef[goID]) for goID in listGoID), + ) + goGID = dict(zip(listGoID, listGID)) + self.log("writing ontology terms to the database completed\n") + + # store ontology term names + self.log("writing ontology term names to the database ...\n") + self.addGroupNamespacedNames( + namespaceID["go_id"], ((goGID[goID], goID) for goID in listGoID) + ) + self.addGroupNamespacedNames( + namespaceID["ontology"], ((goGID[goID], goName[goID]) for goID in listGoID) + ) + self.log("writing ontology term names to the database completed\n") + + # store ontology term links + self.log("writing ontology term relationships to the database ...\n") + listLinks = [] + for goID in goLinks: + for link in goLinks[goID] or empty: + if link[0] in goGID: + listLinks.append((goGID[goID], goGID[link[0]], link[1], link[2])) + self.addGroupRelationships(listLinks) + self.log("writing ontology term relationships to the database completed\n") + + # process gene associations + self.log("processing gene associations ...\n") + if os.path.isfile( + path + "/gene_association.goa_human.gz" + ) and not os.path.isfile(path + "/goa_human.gaf.gz"): + assocFile = self.zfile( + path + "/gene_association.goa_human.gz" + ) # TODO:context manager,iterator + else: + assocFile = self.zfile( + path + "/goa_human.gaf.gz" + ) # TODO:context manager,iterator + nsAssoc = {"uniprot_pid": set(), "symbol": set()} + numAssoc = numID = 0 + for line in assocFile: + words = line.split("\t") + if len(words) < 13: + continue + xrefDB = words[0] + xrefID = words[1] + gene = words[2] + # assocType = words[3] + goID = words[4] + # reference = words[5] + evidence = words[6] + # withID = words[7] + # goType = words[8] + # desc = words[9] + aliases = words[10].split("|") + # xrefType = words[11] + taxon = words[12] + # updated = words[13] + # assigner = words[14] + # extensions = words[15].split('|') + # xrefIDsplice = words[16] + + # TODO: find out for sure why the old Biofilter loader ignores IEA + if ( + xrefDB == "UniProtKB" + and goID in goGID + and evidence != "IEA" + and taxon == "taxon:9606" + ): + numAssoc 
+= 1 + numID += 2 + nsAssoc["uniprot_pid"].add((goGID[goID], numAssoc, xrefID)) + nsAssoc["symbol"].add((goGID[goID], numAssoc, gene)) + for alias in aliases: + numID += 1 + # aliases might be either symbols or uniprot identifiers, so try them both ways + nsAssoc["uniprot_pid"].add((goGID[goID], numAssoc, alias)) + nsAssoc["symbol"].add((goGID[goID], numAssoc, alias)) + # if association is ok + # foreach association + self.log( + "processing gene associations completed: %d associations (%d identifiers)\n" + % (numAssoc, numID) + ) + + # store gene associations + self.log("writing gene associations to the database ...\n") + for ns in nsAssoc: + self.addGroupMemberTypedNamespacedNames( + typeID["gene"], namespaceID[ns], nsAssoc[ns] + ) + self.log("writing gene associations to the database completed\n") + + # update() + + +# Source_go diff --git a/loki/loaders/loki_source_gwas.py b/loki/loaders/loki_source_gwas.py index 589b5ac..d9420a7 100644 --- a/loki/loaders/loki_source_gwas.py +++ b/loki/loaders/loki_source_gwas.py @@ -6,150 +6,226 @@ class Source_gwas(loki_source.Source): - - - ################################################## - # source interface - - - @classmethod - def getVersionString(cls): - return '2.5 (2016-09-19)' - #getVersionString() - - - def download(self, options, path): - # download the latest source files - # self.downloadFilesFromHTTP('www.genome.gov', { - # 'gwascatalog.txt': '/admin/gwascatalog.txt', - # }) - self.downloadFilesFromHTTP('www.ebi.ac.uk', { - path+'/gwas_catalog_v1.0-associations.tsv' : '/gwas/api/search/downloads/full' - }, alwaysDownload=True) - return [ - path+'/gwas_catalog_v1.0-associations.tsv' - ] - #download() - - - def update(self, options, path): - # clear out all old data from this source - self.log("deleting old records from the database ...\n") - self.deleteAll() - self.log("deleting old records from the database completed\n") - - # process gwas cataog - # the catalog uses dbSNP positions from b132, which should already be 1-based - self.log("processing GWAS catalog annotations ...\n") - reRS = re.compile('rs([0-9]+)', re.I) - reChrPos = re.compile('(?:^|[^_])chr([0-9XYMT]+)[:_]([0-9]+)', re.I) - reSNP = re.compile('(?:^|[^_])(?:chr([0-9XYMT]+)[:_]([0-9]+)|rs([0-9]+))', re.I) - listNone = [None] - numInc = numInvalid = 0 - setGwas = set() - if os.path.exists(path+'/gwas_catalog_v1.0-associations.tsv'): - with open(path+'/gwas_catalog_v1.0-associations.tsv','r') as gwasFile: - header = next(gwasFile).rstrip() - cols = list(w.strip() for w in header.split("\t")) - try: - colPubmedID = cols.index("PUBMEDID") - colTrait = cols.index("DISEASE/TRAIT") - colChm = cols.index("CHR_ID") - colPos = cols.index("CHR_POS") - colAlleles = cols.index("STRONGEST SNP-RISK ALLELE") - colSNPs = cols.index("SNPS") - colRAF = cols.index("RISK ALLELE FREQUENCY") - colORBeta = cols.index("OR or BETA") - col95CI = cols.index("95% CI (TEXT)") - except ValueError as e: - self.log(" ERROR\n") - raise Exception("unrecognized file header: %s" % str(e)) - l = 1 - for line in gwasFile: - l += 1 - line = line.rstrip("\r\n") - words = list(w.strip() for w in line.split("\t")) - if len(words) <= col95CI: - # blank line at the end is normal - if (len(words) > 1) or words[0]: - numInc += 1 - continue - elif (' x ' in words[colPos]) or (' x ' in words[colSNPs]): - # GWAS interaction pairs are not yet supported in LOKI - numInvalid += 1 - continue - pubmedID = int(words[colPubmedID]) if words[colPubmedID] else None - trait = words[colTrait] - listChm = words[colChm].split(';') if 
words[colChm] else list() - listPos = words[colPos].split(';') if words[colPos] else list() - snps = words[colSNPs] if words[colAlleles].endswith('aplotype') else words[colAlleles] - listSNPs = reSNP.findall(snps) - riskAfreq = words[colRAF] - orBeta = words[colORBeta] - allele95ci = words[col95CI] - if (len(listChm) == len(listPos) == 0) and (len(listSNPs) > 0): - listChm = listPos = list(None for i in range(len(listSNPs))) - if (len(listChm) == len(listPos)) and (len(listChm) > 0) and (len(listSNPs) == 0): - listSNPs = list((None,None,None) for i in range(len(listChm))) - if len(listChm) == len(listPos) == len(listSNPs): - for i in range(len(listSNPs)): - rs = int(listSNPs[i][2]) if listSNPs[i][2] else None - chm = self._loki.chr_num.get(listChm[i]) or self._loki.chr_num.get(listSNPs[i][0]) - pos = int(listPos[i]) if listPos[i] else (int(listSNPs[i][1]) if listSNPs[i][1] else None) - setGwas.add( (rs,chm,pos,trait,snps,orBeta,allele95ci,riskAfreq,pubmedID) ) - elif len(listChm) == len(listPos): - for i in range(len(listChm)): - rs = None - chm = self._loki.chr_num.get(listChm[i]) - pos = int(listPos[i]) if listPos[i] else None - setGwas.add( (rs,chm,pos,trait,snps,orBeta,allele95ci,riskAfreq,pubmedID) ) - for i in range(len(listSNPs)): - rs = int(listSNPs[i][2]) if listSNPs[i][2] else None - chm = self._loki.chr_num.get(listSNPs[i][0]) - pos = int(listSNPs[i][1]) if listSNPs[i][1] else None - setGwas.add( (rs,chm,pos,trait,snps,orBeta,allele95ci,riskAfreq,pubmedID) ) - else: - numInvalid += 1 - #foreach line - #with gwasFile - else: - with open(path+'/gwascatalog.txt','r') as gwasFile: - header = next(gwasFile).rstrip() - if header.startswith("Date Added to Catalog\tPUBMEDID\tFirst Author\tDate\tJournal\tLink\tStudy\tDisease/Trait\tInitial Sample Size\tReplication Sample Size\tRegion\tChr_id\tChr_pos\tReported Gene(s)\tMapped_gene\tUpstream_gene_id\tDownstream_gene_id\tSnp_gene_ids\tUpstream_gene_distance\tDownstream_gene_distance\tStrongest SNP-Risk Allele\tSNPs\tMerged\tSnp_id_current\tContext\tIntergenic\tRisk Allele Frequency\tp-Value\tPvalue_mlog\tp-Value (text)\tOR or beta\t95% CI (text)\t"): # "Platform [SNPs passing QC]\tCNV" - pass - elif header.startswith("Date Added to Catalog\tPUBMEDID\tFirst Author\tDate\tJournal\tLink\tStudy\tDisease/Trait\tInitial Sample Description\tReplication Sample Description\tRegion\tChr_id\tChr_pos\tReported Gene(s)\tMapped_gene\tUpstream_gene_id\tDownstream_gene_id\tSnp_gene_ids\tUpstream_gene_distance\tDownstream_gene_distance\tStrongest SNP-Risk Allele\tSNPs\tMerged\tSnp_id_current\tContext\tIntergenic\tRisk Allele Frequency\tp-Value\tPvalue_mlog\tp-Value (text)\tOR or beta\t95% CI (text)\t"): # "Platform [SNPs passing QC]\tCNV" - pass - else: - self.log(" ERROR\n") - raise Exception("unrecognized file header") - for line in gwasFile: - line = line.rstrip("\r\n") - words = list(w.strip() for w in line.decode('latin-1').split("\t")) - if len(words) <= 31: - # blank line at the end is normal - if (len(words) > 1) or words[0]: - numInc += 1 - continue - chm = self._loki.chr_num[words[11]] if (words[11] in self._loki.chr_num) else None - pos = int(words[12]) if words[12] else None - trait = words[7] - snps = words[21] if words[20].endswith('aplotype') else words[20] - rses = list(int(rs[2:]) for rs in reRS.findall(snps)) or listNone - orBeta = words[30] - allele95ci = words[31] - riskAfreq = words[26] - pubmedID = int(words[1]) if words[1] else None - for rs in rses: - setGwas.add( (rs,chm,pos,trait,snps,orBeta,allele95ci,riskAfreq,pubmedID) ) - 
#foreach line - #with gwasFile - #if path - self.log("processing GWAS catalog annotations completed: %d entries (%d incomplete, %d invalid)\n" % (len(setGwas),numInc,numInvalid)) - if setGwas: - self.log("writing GWAS catalog annotations to the database ...\n") - self.addGWASAnnotations(setGwas) - self.log("writing GWAS catalog annotations to the database completed\n") - #update() - -#Source_gwas + ################################################## + # source interface + + @classmethod + def getVersionString(cls): + return "2.5 (2016-09-19)" + + # getVersionString() + + def download(self, options, path): + # download the latest source files + # self.downloadFilesFromHTTP('www.genome.gov', { + # 'gwascatalog.txt': '/admin/gwascatalog.txt', + # }) + self.downloadFilesFromHTTP( + "www.ebi.ac.uk", + { + path + + "/gwas_catalog_v1.0-associations.tsv": "/gwas/api/search/downloads/full" + }, + alwaysDownload=True, + ) + + return [path + "/gwas_catalog_v1.0-associations.tsv"] + + # download() + + def update(self, options, path): + # clear out all old data from this source + self.log("deleting old records from the database ...\n") + self.deleteAll() + self.log("deleting old records from the database completed\n") + + # process gwas catalog + # the catalog uses dbSNP positions from b132, which should already be 1-based + self.log("processing GWAS catalog annotations ...\n") + reRS = re.compile("rs([0-9]+)", re.I) + reChrPos = re.compile("(?:^|[^_])chr([0-9XYMT]+)[:_]([0-9]+)", re.I) + reSNP = re.compile("(?:^|[^_])(?:chr([0-9XYMT]+)[:_]([0-9]+)|rs([0-9]+))", re.I) + listNone = [None] + numInc = numInvalid = 0 + setGwas = set() + if os.path.exists(path + "/gwas_catalog_v1.0-associations.tsv"): + with open(path + "/gwas_catalog_v1.0-associations.tsv", "r") as gwasFile: + header = next(gwasFile).rstrip() + cols = list(w.strip() for w in header.split("\t")) + try: + colPubmedID = cols.index("PUBMEDID") + colTrait = cols.index("DISEASE/TRAIT") + colChm = cols.index("CHR_ID") + colPos = cols.index("CHR_POS") + colAlleles = cols.index("STRONGEST SNP-RISK ALLELE") + colSNPs = cols.index("SNPS") + colRAF = cols.index("RISK ALLELE FREQUENCY") + colORBeta = cols.index("OR or BETA") + col95CI = cols.index("95% CI (TEXT)") + except ValueError as e: + self.log(" ERROR\n") + raise Exception("unrecognized file header: %s" % str(e)) + l = 1 + for line in gwasFile: + l += 1 + line = line.rstrip("\r\n") + words = list(w.strip() for w in line.split("\t")) + if len(words) <= col95CI: + # blank line at the end is normal + if (len(words) > 1) or words[0]: + numInc += 1 + continue + elif (" x " in words[colPos]) or (" x " in words[colSNPs]): + # GWAS interaction pairs are not yet supported in LOKI + numInvalid += 1 + continue + pubmedID = int(words[colPubmedID]) if words[colPubmedID] else None + trait = words[colTrait] + listChm = words[colChm].split(";") if words[colChm] else list() + listPos = words[colPos].split(";") if words[colPos] else list() + snps = ( + words[colSNPs] + if words[colAlleles].endswith("aplotype") + else words[colAlleles] + ) + listSNPs = reSNP.findall(snps) + riskAfreq = words[colRAF] + orBeta = words[colORBeta] + allele95ci = words[col95CI] + if (len(listChm) == len(listPos) == 0) and (len(listSNPs) > 0): + listChm = listPos = list(None for i in range(len(listSNPs))) + if ( + (len(listChm) == len(listPos)) + and (len(listChm) > 0) + and (len(listSNPs) == 0) + ): + listSNPs = list((None, None, None) for i in range(len(listChm))) + if len(listChm) == len(listPos) == len(listSNPs): + for i in 
range(len(listSNPs)): + rs = int(listSNPs[i][2]) if listSNPs[i][2] else None + chm = self._loki.chr_num.get( + listChm[i] + ) or self._loki.chr_num.get(listSNPs[i][0]) + pos = ( + int(listPos[i]) + if listPos[i] + else (int(listSNPs[i][1]) if listSNPs[i][1] else None) + ) + setGwas.add( + ( + rs, + chm, + pos, + trait, + snps, + orBeta, + allele95ci, + riskAfreq, + pubmedID, + ) + ) + elif len(listChm) == len(listPos): + for i in range(len(listChm)): + rs = None + chm = self._loki.chr_num.get(listChm[i]) + pos = int(listPos[i]) if listPos[i] else None + setGwas.add( + ( + rs, + chm, + pos, + trait, + snps, + orBeta, + allele95ci, + riskAfreq, + pubmedID, + ) + ) + for i in range(len(listSNPs)): + rs = int(listSNPs[i][2]) if listSNPs[i][2] else None + chm = self._loki.chr_num.get(listSNPs[i][0]) + pos = int(listSNPs[i][1]) if listSNPs[i][1] else None + setGwas.add( + ( + rs, + chm, + pos, + trait, + snps, + orBeta, + allele95ci, + riskAfreq, + pubmedID, + ) + ) + else: + numInvalid += 1 + # foreach line + # with gwasFile + else: + with open(path + "/gwascatalog.txt", "r") as gwasFile: + header = next(gwasFile).rstrip() + if header.startswith( + "Date Added to Catalog\tPUBMEDID\tFirst Author\tDate\tJournal\tLink\tStudy\tDisease/Trait\tInitial Sample Size\tReplication Sample Size\tRegion\tChr_id\tChr_pos\tReported Gene(s)\tMapped_gene\tUpstream_gene_id\tDownstream_gene_id\tSnp_gene_ids\tUpstream_gene_distance\tDownstream_gene_distance\tStrongest SNP-Risk Allele\tSNPs\tMerged\tSnp_id_current\tContext\tIntergenic\tRisk Allele Frequency\tp-Value\tPvalue_mlog\tp-Value (text)\tOR or beta\t95% CI (text)\t" + ): # "Platform [SNPs passing QC]\tCNV" + pass + elif header.startswith( + "Date Added to Catalog\tPUBMEDID\tFirst Author\tDate\tJournal\tLink\tStudy\tDisease/Trait\tInitial Sample Description\tReplication Sample Description\tRegion\tChr_id\tChr_pos\tReported Gene(s)\tMapped_gene\tUpstream_gene_id\tDownstream_gene_id\tSnp_gene_ids\tUpstream_gene_distance\tDownstream_gene_distance\tStrongest SNP-Risk Allele\tSNPs\tMerged\tSnp_id_current\tContext\tIntergenic\tRisk Allele Frequency\tp-Value\tPvalue_mlog\tp-Value (text)\tOR or beta\t95% CI (text)\t" + ): # "Platform [SNPs passing QC]\tCNV" + pass + else: + self.log(" ERROR\n") + raise Exception("unrecognized file header") + for line in gwasFile: + line = line.rstrip("\r\n") + words = list(w.strip() for w in line.decode("latin-1").split("\t")) + if len(words) <= 31: + # blank line at the end is normal + if (len(words) > 1) or words[0]: + numInc += 1 + continue + chm = ( + self._loki.chr_num[words[11]] + if (words[11] in self._loki.chr_num) + else None + ) + pos = int(words[12]) if words[12] else None + trait = words[7] + snps = words[21] if words[20].endswith("aplotype") else words[20] + rses = list(int(rs[2:]) for rs in reRS.findall(snps)) or listNone + orBeta = words[30] + allele95ci = words[31] + riskAfreq = words[26] + pubmedID = int(words[1]) if words[1] else None + for rs in rses: + setGwas.add( + ( + rs, + chm, + pos, + trait, + snps, + orBeta, + allele95ci, + riskAfreq, + pubmedID, + ) + ) + # foreach line + # with gwasFile + # if path + self.log( + "processing GWAS catalog annotations completed: %d entries (%d incomplete, %d invalid)\n" + % (len(setGwas), numInc, numInvalid) + ) + if setGwas: + self.log("writing GWAS catalog annotations to the database ...\n") + self.addGWASAnnotations(setGwas) + self.log("writing GWAS catalog annotations to the database completed\n") + + # update() + + +# Source_gwas diff --git 
a/loki/loaders/loki_source_mint.py b/loki/loaders/loki_source_mint.py index 1312eca..8ba0e7b 100644 --- a/loki/loaders/loki_source_mint.py +++ b/loki/loaders/loki_source_mint.py @@ -7,232 +7,296 @@ class Source_mint(loki_source.Source): - - - ################################################## - # private class methods - - - def _identifyLatestFilename(self, filenames): - reFile = re.compile('^([0-9]+)-([0-9]+)-([0-9]+)-mint-human.txt$', re.IGNORECASE) - bestdate = datetime.date.min - bestfile = None - for filename in filenames: - match = reFile.match(filename) - if match: - filedate = datetime.date(int(match.group(1)), int(match.group(2)), int(match.group(3))) - if filedate > bestdate: - bestdate = filedate - bestfile = filename - #foreach filename - return bestfile - #_identifyLatestFilename() - - - ################################################## - # source interface - - - @classmethod - def getVersionString(cls): - return '2.2 (2018-02-20)' - #getVersionString() - - - def download(self, options, path): - #self.downloadFilesFromHTTP('mint.bio.uniroma2.it', { - # 'MINT_MiTab.txt': '/mitab/MINT_MiTab.txt', - #}) - self.downloadFilesFromHTTP('www.ebi.ac.uk', { - path+'/MINT_MiTab.txt': '/Tools/webservices/psicquic/mint/webservices/current/search/query/species:human', - }) - - return [ path+'/MINT_MiTab.txt' ] - #download() - - - def update(self, options, path): - # clear out all old data from this source - self.log("deleting old records from the database ...\n") - self.deleteAll() - self.log("deleting old records from the database completed\n") - - # get or create the required metadata records - namespaceID = self.addNamespaces([ - ('mint_id', 0), - ('symbol', 0), - ('entrez_gid', 0), - ('ensembl_gid', 0), - ('ensembl_pid', 1), - ('refseq_gid', 0), - ('refseq_pid', 1), - ('uniprot_pid', 1), - ]) - typeID = self.addTypes([ - ('interaction',), - ('gene',), - ]) - subtypeID = self.addSubtypes([ - ('-',), - ]) - - # process interation groups - self.log("processing interaction groups ...\n") - mintDesc = dict() - nsAssoc = { - 'symbol': set(), - 'entrez_gid': set(), - 'ensembl_gid': set(), - 'ensembl_pid': set(), - 'refseq_gid': set(), - 'refseq_pid': set(), - 'uniprot_pid': set(), - } - numAssoc = numID = 0 - if os.path.exists(path+'/MINT_MiTab.txt'): - with open(path+'/MINT_MiTab.txt','r') as assocFile: - l = 0 - for line in assocFile: - l += 1 - words = line.split('\t') - - # skip non-human records - if not (words[9].startswith('taxid:9606(') and words[10].startswith('taxid:9606(')): - continue - - # extract relevant columns - geneA = [w.strip() for w in words[0].split('|') if w != '-'] # id A - geneB = [w.strip() for w in words[1].split('|') if w != '-'] # id B - geneA.extend(w.strip() for w in words[2].split('|') if w != '-') # alt id A - geneB.extend(w.strip() for w in words[3].split('|') if w != '-') # alt id B - geneA.extend(w.strip() for w in words[4].split('|') if w != '-') # alias A - geneB.extend(w.strip() for w in words[5].split('|') if w != '-') # alias B - labels = dict( (w.strip().split(':',1) for w in words[13].split('|') if w != '-') ) - if len(words) > 23: - geneA.extend(w.strip() for w in words[22].split('|') if w != '-') # xref A - geneB.extend(w.strip() for w in words[23].split('|') if w != '-') # xref B - - # choose the group identifier - mintID = labels.get('mint') or labels.get('intact') or ('MINT-unlabeled-%d' % (l,)) - mintDesc[mintID] = '' - - for names in (geneA,geneB): - numAssoc += 1 - for name in names: - if ':' not in name: - continue - numID += 1 - prefix,name 
= name.split(':',1) - suffix = '' - if name.endswith(')'): - name,suffix = name[:-1].rsplit('(',1) - if name.startswith('"'): - name = name.split('"')[1] - - if prefix == 'entrezgene/locuslink': - nsAssoc['entrez_gid'].add( (mintID,numAssoc,name) ) - elif prefix == 'ensembl': - namespace = 'ensembl_pid' if name.startswith('ENSP') else 'ensembl_gid' - nsAssoc[namespace].add( (mintID,numAssoc,name) ) - elif prefix == 'refseq': - name = name.rsplit('.',1)[0] - name = name.rsplit(',',1)[0] - nsAssoc['refseq_gid'].add( (mintID,numAssoc,name) ) - nsAssoc['refseq_pid'].add( (mintID,numAssoc,name) ) - elif prefix == 'uniprotkb': - if (suffix == '(gene name)') or (suffix == '(gene name synonym)'): - namespace = 'symbol' - else: - namespace = 'uniprot_pid' - name = name.rsplit('-',1)[0] - nsAssoc[namespace].add( (mintID,numAssoc,name) ) - else: - numID -= 1 - #if prefix/suffix - #foreach name - #foreach interactor - #foreach line in assocFile - #with assocFile - else: # old FTP file - with open(self._identifyLatestFilename(os.listdir(path)),'r') as assocFile: - header = assocFile.next().rstrip() - if not header.startswith("ID interactors A (baits)\tID interactors B (preys)\tAlt. ID interactors A (baits)\tAlt. ID interactors B (preys)\tAlias(es) interactors A (baits)\tAlias(es) interactors B (preys)\tInteraction detection method(s)\tPublication 1st author(s)\tPublication Identifier(s)\tTaxid interactors A (baits)\tTaxid interactors B (preys)\tInteraction type(s)\tSource database(s)\tInteraction identifier(s)\t"): #Confidence value(s)\texpansion\tbiological roles A (baits)\tbiological role B\texperimental roles A (baits)\texperimental roles B (preys)\tinteractor types A (baits)\tinteractor types B (preys)\txrefs A (baits)\txrefs B (preys)\txrefs Interaction\tAnnotations A (baits)\tAnnotations B (preys)\tInteraction Annotations\tHost organism taxid\tparameters Interaction\tdataset\tCaution Interaction\tbinding sites A (baits)\tbinding sites B (preys)\tptms A (baits)\tptms B (preys)\tmutations A (baits)\tmutations B (preys)\tnegative\tinference\tcuration depth": - self.log(" ERROR\n") - self.log("unrecognized file header: %s\n" % header) - return False - xrefNS = { - 'entrezgene/locuslink': ('entrez_gid',), - 'refseq': ('refseq_gid','refseq_pid'), - 'uniprotkb': ('uniprot_pid',), - } - l = 0 - for line in assocFile: - l += 1 - words = line.split('\t') - genes = words[0].split(';') - genes.extend(words[1].split(';')) - aliases = words[4].split(';') - aliases.extend(words[5].split(';')) - method = words[6] - taxes = words[9].split(';') - taxes.extend(words[10].split(';')) - labels = words[13].split('|') - - # identify interaction group label - mint = None - for label in labels: - if label.startswith('mint:'): - mint = label - break - mint = mint or "MINT-unlabeled-%d" % l - mintDesc[mint] = method - - # identify interacting genes/proteins - for n in range(0,len(taxes)): - if taxes[n] == "taxid:9606(Homo sapiens)": - numAssoc += 1 - # the "gene" is a helpful database cross-reference with a label indicating its type - xrefDB,xrefID = genes[n].split(':',1) - if xrefDB in xrefNS: - numID += 1 - if xrefDB == 'refseq': - xrefID = xrefID.rsplit('.',1)[0] - elif xrefDB == 'uniprotkb': - xrefID = xrefID.rsplit('-',1)[0] - for ns in xrefNS[xrefDB]: - nsAssoc[ns].add( (mint,numAssoc,xrefID) ) - # but the "alias" could be of any type and isn't identified, - # so we'll store copies under each possible type - # and find out later which one matches something - numID += 1 - nsAssoc['symbol'].add( 
(mint,numAssoc,aliases[n]) ) - nsAssoc['refseq_gid'].add( (mint,numAssoc,aliases[n].rsplit('.',1)[0]) ) - nsAssoc['refseq_pid'].add( (mint,numAssoc,aliases[n].rsplit('.',1)[0]) ) - nsAssoc['uniprot_pid'].add( (mint,numAssoc,aliases[n].rsplit('-',1)[0]) ) - #if human - #foreach interacting gene/protein - #foreach line in assocFile - #with assocFile - #if new/old file - self.log("processing interaction groups completed: %d groups, %d associations (%d identifiers)\n" % (len(mintDesc),numAssoc,numID)) - - # store interaction groups - self.log("writing interaction groups to the database ...\n") - listMint = mintDesc.keys() - listGID = self.addTypedGroups(typeID['interaction'], ((subtypeID['-'], mint,mintDesc[mint]) for mint in listMint)) - mintGID = dict(zip(listMint,listGID)) - self.log("writing interaction groups to the database completed\n") - - # store interaction group names - self.log("writing interaction group names to the database ...\n") - self.addGroupNamespacedNames(namespaceID['mint_id'], ((mintGID[mint],mint) for mint in listMint)) - self.log("writing interaction group names to the database completed\n") - - # store gene interactions - self.log("writing gene interactions to the database ...\n") - for ns in nsAssoc: - self.addGroupMemberTypedNamespacedNames(typeID['gene'], namespaceID[ns], ((mintGID[a[0]],a[1],a[2]) for a in nsAssoc[ns])) - self.log("writing gene interactions to the database completed\n") - #update() - -#Source_mint + + ################################################## + # private class methods + + def _identifyLatestFilename(self, filenames): + reFile = re.compile( + "^([0-9]+)-([0-9]+)-([0-9]+)-mint-human.txt$", re.IGNORECASE + ) + bestdate = datetime.date.min + bestfile = None + for filename in filenames: + match = reFile.match(filename) + if match: + filedate = datetime.date( + int(match.group(1)), int(match.group(2)), int(match.group(3)) + ) + if filedate > bestdate: + bestdate = filedate + bestfile = filename + # foreach filename + return bestfile + + # _identifyLatestFilename() + + ################################################## + # source interface + + @classmethod + def getVersionString(cls): + return "2.2 (2018-02-20)" + + # getVersionString() + + def download(self, options, path): + # self.downloadFilesFromHTTP('mint.bio.uniroma2.it', { + # 'MINT_MiTab.txt': '/mitab/MINT_MiTab.txt', + # }) + self.downloadFilesFromHTTP( + "www.ebi.ac.uk", + { + path + + "/MINT_MiTab.txt": "/Tools/webservices/psicquic/mint/webservices/current/search/query/species:human", + }, + ) + + return [path + "/MINT_MiTab.txt"] + + # download() + + def update(self, options, path): + # clear out all old data from this source + self.log("deleting old records from the database ...\n") + self.deleteAll() + self.log("deleting old records from the database completed\n") + + # get or create the required metadata records + namespaceID = self.addNamespaces( + [ + ("mint_id", 0), + ("symbol", 0), + ("entrez_gid", 0), + ("ensembl_gid", 0), + ("ensembl_pid", 1), + ("refseq_gid", 0), + ("refseq_pid", 1), + ("uniprot_pid", 1), + ] + ) + typeID = self.addTypes( + [ + ("interaction",), + ("gene",), + ] + ) + subtypeID = self.addSubtypes( + [ + ("-",), + ] + ) + + # process interaction groups + self.log("processing interaction groups ...\n") + mintDesc = dict() + nsAssoc = { + "symbol": set(), + "entrez_gid": set(), + "ensembl_gid": set(), + "ensembl_pid": set(), + "refseq_gid": set(), + "refseq_pid": set(), + "uniprot_pid": set(), + } + numAssoc = numID = 0 + if os.path.exists(path + 
"/MINT_MiTab.txt"): + with open(path + "/MINT_MiTab.txt", "r") as assocFile: + l = 0 + for line in assocFile: + l += 1 + words = line.split("\t") + + # skip non-human records + if not ( + words[9].startswith("taxid:9606(") + and words[10].startswith("taxid:9606(") + ): + continue + + # extract relevant columns + geneA = [w.strip() for w in words[0].split("|") if w != "-"] # id A + geneB = [w.strip() for w in words[1].split("|") if w != "-"] # id B + geneA.extend( + w.strip() for w in words[2].split("|") if w != "-" + ) # alt id A + geneB.extend( + w.strip() for w in words[3].split("|") if w != "-" + ) # alt id B + geneA.extend( + w.strip() for w in words[4].split("|") if w != "-" + ) # alias A + geneB.extend( + w.strip() for w in words[5].split("|") if w != "-" + ) # alias B + labels = dict( + ( + w.strip().split(":", 1) + for w in words[13].split("|") + if w != "-" + ) + ) + if len(words) > 23: + geneA.extend( + w.strip() for w in words[22].split("|") if w != "-" + ) # xref A + geneB.extend( + w.strip() for w in words[23].split("|") if w != "-" + ) # xref B + + # choose the group identifier + mintID = ( + labels.get("mint") + or labels.get("intact") + or ("MINT-unlabeled-%d" % (l,)) + ) + mintDesc[mintID] = "" + + for names in (geneA, geneB): + numAssoc += 1 + for name in names: + if ":" not in name: + continue + numID += 1 + prefix, name = name.split(":", 1) + suffix = "" + if name.endswith(")"): + name, suffix = name[:-1].rsplit("(", 1) + if name.startswith('"'): + name = name.split('"')[1] + + if prefix == "entrezgene/locuslink": + nsAssoc["entrez_gid"].add((mintID, numAssoc, name)) + elif prefix == "ensembl": + namespace = ( + "ensembl_pid" + if name.startswith("ENSP") + else "ensembl_gid" + ) + nsAssoc[namespace].add((mintID, numAssoc, name)) + elif prefix == "refseq": + name = name.rsplit(".", 1)[0] + name = name.rsplit(",", 1)[0] + nsAssoc["refseq_gid"].add((mintID, numAssoc, name)) + nsAssoc["refseq_pid"].add((mintID, numAssoc, name)) + elif prefix == "uniprotkb": + if (suffix == "(gene name)") or ( + suffix == "(gene name synonym)" + ): + namespace = "symbol" + else: + namespace = "uniprot_pid" + name = name.rsplit("-", 1)[0] + nsAssoc[namespace].add((mintID, numAssoc, name)) + else: + numID -= 1 + # if prefix/suffix + # foreach name + # foreach interactor + # foreach line in assocFile + # with assocFile + else: # old FTP file + with open(self._identifyLatestFilename(os.listdir(path)), "r") as assocFile: + header = assocFile.next().rstrip() + if not header.startswith( + "ID interactors A (baits)\tID interactors B (preys)\tAlt. ID interactors A (baits)\tAlt. 
ID interactors B (preys)\tAlias(es) interactors A (baits)\tAlias(es) interactors B (preys)\tInteraction detection method(s)\tPublication 1st author(s)\tPublication Identifier(s)\tTaxid interactors A (baits)\tTaxid interactors B (preys)\tInteraction type(s)\tSource database(s)\tInteraction identifier(s)\t" + ): # Confidence value(s)\texpansion\tbiological roles A (baits)\tbiological role B\texperimental roles A (baits)\texperimental roles B (preys)\tinteractor types A (baits)\tinteractor types B (preys)\txrefs A (baits)\txrefs B (preys)\txrefs Interaction\tAnnotations A (baits)\tAnnotations B (preys)\tInteraction Annotations\tHost organism taxid\tparameters Interaction\tdataset\tCaution Interaction\tbinding sites A (baits)\tbinding sites B (preys)\tptms A (baits)\tptms B (preys)\tmutations A (baits)\tmutations B (preys)\tnegative\tinference\tcuration depth": + self.log(" ERROR\n") + self.log("unrecognized file header: %s\n" % header) + return False + xrefNS = { + "entrezgene/locuslink": ("entrez_gid",), + "refseq": ("refseq_gid", "refseq_pid"), + "uniprotkb": ("uniprot_pid",), + } + l = 0 + for line in assocFile: + l += 1 + words = line.split("\t") + genes = words[0].split(";") + genes.extend(words[1].split(";")) + aliases = words[4].split(";") + aliases.extend(words[5].split(";")) + method = words[6] + taxes = words[9].split(";") + taxes.extend(words[10].split(";")) + labels = words[13].split("|") + + # identify interaction group label + mint = None + for label in labels: + if label.startswith("mint:"): + mint = label + break + mint = mint or "MINT-unlabeled-%d" % l + mintDesc[mint] = method + + # identify interacting genes/proteins + for n in range(0, len(taxes)): + if taxes[n] == "taxid:9606(Homo sapiens)": + numAssoc += 1 + # the "gene" is a helpful database cross-reference with a label indicating its type + xrefDB, xrefID = genes[n].split(":", 1) + if xrefDB in xrefNS: + numID += 1 + if xrefDB == "refseq": + xrefID = xrefID.rsplit(".", 1)[0] + elif xrefDB == "uniprotkb": + xrefID = xrefID.rsplit("-", 1)[0] + for ns in xrefNS[xrefDB]: + nsAssoc[ns].add((mint, numAssoc, xrefID)) + # but the "alias" could be of any type and isn't identified, + # so we'll store copies under each possible type + # and find out later which one matches something + numID += 1 + nsAssoc["symbol"].add((mint, numAssoc, aliases[n])) + nsAssoc["refseq_gid"].add( + (mint, numAssoc, aliases[n].rsplit(".", 1)[0]) + ) + nsAssoc["refseq_pid"].add( + (mint, numAssoc, aliases[n].rsplit(".", 1)[0]) + ) + nsAssoc["uniprot_pid"].add( + (mint, numAssoc, aliases[n].rsplit("-", 1)[0]) + ) + # if human + # foreach interacting gene/protein + # foreach line in assocFile + # with assocFile + # if new/old file + self.log( + "processing interaction groups completed: %d groups, %d associations (%d identifiers)\n" + % (len(mintDesc), numAssoc, numID) + ) + + # store interaction groups + self.log("writing interaction groups to the database ...\n") + listMint = mintDesc.keys() + listGID = self.addTypedGroups( + typeID["interaction"], + ((subtypeID["-"], mint, mintDesc[mint]) for mint in listMint), + ) + mintGID = dict(zip(listMint, listGID)) + self.log("writing interaction groups to the database completed\n") + + # store interaction group names + self.log("writing interaction group names to the database ...\n") + self.addGroupNamespacedNames( + namespaceID["mint_id"], ((mintGID[mint], mint) for mint in listMint) + ) + self.log("writing interaction group names to the database completed\n") + + # store gene interactions + self.log("writing 
gene interactions to the database ...\n") + for ns in nsAssoc: + self.addGroupMemberTypedNamespacedNames( + typeID["gene"], + namespaceID[ns], + ((mintGID[a[0]], a[1], a[2]) for a in nsAssoc[ns]), + ) + self.log("writing gene interactions to the database completed\n") + + # update() + + +# Source_mint diff --git a/loki/loaders/loki_source_oreganno.py b/loki/loaders/loki_source_oreganno.py index d9604cc..5bb261e 100644 --- a/loki/loaders/loki_source_oreganno.py +++ b/loki/loaders/loki_source_oreganno.py @@ -5,198 +5,240 @@ class Source_oreganno(loki_source.Source): - - _remHost = "hgdownload.cse.ucsc.edu" - _remPath = "/goldenPath/hg19/database/" - - _remFiles = ["oreganno.txt.gz","oregannoAttr.txt.gz", "oregannoLink.txt.gz"] - - - @classmethod - def getVersionString(cls): - return '2.1 (2016-09-19)' - #getVersionString() - - - def download(self, options, path): - """ - Download OregAnno from UCSC - """ -# self.downloadFilesFromFTP(self._remHost, dict(((f, self._remPath + f) for f in self._remFiles))) - self.downloadFilesFromHTTP(self._remHost, dict(((path+'/'+f, self._remPath + f) for f in self._remFiles))) - - return [(path+'/'+f) for f in self._remFiles] - - def update(self, options, path): - """ - Update the database with the OregAnno data from ucsc - """ - - self.log("deleting old records from the database ...\n") - self.deleteAll() - self.log("deleting old records from the database completed\n") - - # Add the 'oreganno' namespace - ns = self.addNamespace('oreganno') - - # Add the ensembl and entrez namespaces - external_ns = self.addNamespaces([ - ('symbol', 0), - ('entrez_gid', 0), - ('ensembl_gid', 0), - ]) - - # Add the types of Regions - typeids = self.addTypes([ - ('regulatory_region',), - ('tfbs',), - ('gene',) - ]) - subtypeID = self.addSubtypes([ - ('-',), - ]) - - # Add the types of groups - group_typeid = self.addType('regulatory_group') - - # Add the role for regulatory - snp_roleid = self.addRole("regulatory", "OregAnno Regulatory Polymorphism", 1, 0) - - # Get the default population ID - ldprofile_id = self.addLDProfile('', 'no LD adjustment') - - # build dict of gene id->oreganno id and a dict of - # oreganno id->entrez id and oreganno id->ensembl id - oreg_gene = {} - oreg_tfbs = {} - oreg_snp = {} - link_f = self.zfile(path+'/oregannoLink.txt.gz') - entrez_ns = external_ns['entrez_gid'] - ensembl_ns = external_ns['ensembl_gid'] - symbol_ns = external_ns['symbol'] - self.log("parsing external links ...\n") - for l in link_f: - fields = l.split() - if fields[1] == "Gene": - oreg_id = fields[0] - if fields[2] in ("EnsemblGene","EnsemblId"): - gene_id = fields[3].split(',')[-1] # used to be "Homo_sapiens,ENSG123" but now just "ENSG123" - oreg_gene.setdefault(oreg_id,{})[ensembl_ns] = gene_id - elif fields[2] in ("EntrezGene","NCBIGene"): - gene_id = fields[3] - oreg_gene.setdefault(oreg_id,{})[entrez_ns] = gene_id - elif fields[1] == "TFbs": - oreg_id = fields[0] - if fields[2] in ("EnsemblGene","EnsemblId"): - gene_id = fields[3].split(',')[-1] # used to be "Homo_sapiens,ENSG123" but now just "ENSG123" - oreg_tfbs.setdefault(oreg_id,{})[ensembl_ns] = gene_id - elif fields[2] in ("EntrezGene","NCBIGene"): - gene_id = fields[3] - oreg_tfbs.setdefault(oreg_id,{})[entrez_ns] = gene_id - elif fields[1] == "ExtLink" and fields[2] == "dbSNP": - # Just store the RS# (no leading "rs") - oreg_snp[fields[0]] = fields[3][2:] - #for l - self.log("parsing external links completed: %d genes, %d TFBs, %d SNPs\n" % (len(oreg_gene),len(oreg_tfbs),len(oreg_snp))) - - # Now, create a dict of 
oreganno id->type - oreganno_type = {} - self.log("parsing region attributes ...\n") - attr_f = self.zfile(path+"/oregannoAttr.txt.gz") - for l in attr_f: - fields = l.split('\t') - if fields[1] == "type": - oreganno_type[fields[0]] = fields[2] - elif fields[1] == "Gene": - oreg_gene.setdefault(fields[0],{})[symbol_ns] = fields[2] - elif fields[1] == "TFbs": - oreg_tfbs.setdefault(fields[0],{})[symbol_ns] = fields[2] - #for l - self.log("parsing region attributes completed: %d genes, %d TFBs\n" % (len(oreg_gene),len(oreg_tfbs))) - - # OK, now parse the actual regions themselves - region_f = self.zfile(path+'/oreganno.txt.gz') - oreganno_roles = [] - oreganno_regions = [] - oreganno_bounds = [] - oreganno_groups = {} - oreganno_types = {} - self.log("parsing regulatory regions ...\n") - snps_unmapped = 0 - for l in region_f: - fields = l.split() - chrom = self._loki.chr_num.get(fields[1][3:]) - start = int(fields[2]) + 1 - stop = int(fields[3]) - oreg_id = fields[4] - oreg_type = oreganno_type[oreg_id].upper() # used to be CAPS, now Title Case - if chrom and oreg_type == "REGULATORY POLYMORPHISM": - entrez_id = oreg_gene.get(oreg_id,{}).get(entrez_ns) - rsid = oreg_snp.get(oreg_id) - if entrez_id and rsid: - oreganno_roles.append((int(rsid), entrez_id, snp_roleid)) - else: - snps_unmapped+=1 - elif chrom and (oreg_type == "REGULATORY REGION" or oreg_type == "TRANSCRIPTION FACTOR BINDING SITE"): - gene_symbol = oreg_gene.get(oreg_id,{}).get(symbol_ns) - if not gene_symbol: - gene_symbol = oreg_tfbs.get(oreg_id,{}).get(symbol_ns) - - if gene_symbol: - oreganno_groups.setdefault(gene_symbol, []).append(oreg_id) - - if oreg_type == "REGULATORY REGION": - oreg_typeid = typeids['regulatory_region'] - else: - oreg_typeid = typeids['tfbs'] - - oreganno_types[oreg_id] = oreg_typeid - oreganno_regions.append((oreg_typeid, oreg_id, '')) - oreganno_bounds.append((chrom, start, stop)) - #if chrom and oreg_type - #for l - self.log("parsing regulatory regions completed (%d regions found, %d SNPs found, %d SNPs unmapped)\n" % (len(oreganno_regions), len(oreganno_roles), snps_unmapped)) - - self.log("writing to database ...\n") - self.addSNPEntrezRoles(oreganno_roles) - reg_ids = self.addBiopolymers(oreganno_regions) - self.addBiopolymerNamespacedNames(ns, ((reg_ids[i], oreganno_regions[i][1]) for i in range(len(reg_ids)))) - bound_gen = zip(((r,) for r in reg_ids),oreganno_bounds) - self.addBiopolymerLDProfileRegions(ldprofile_id, ((itertools.chain(*c) for c in bound_gen))) - - # Now, add the regulation groups - oreg_genes = list(oreganno_groups.keys()) - oreg_gids = self.addTypedGroups(group_typeid, ((subtypeID['-'], "regulatory_%s" % k, "OregAnno Regulation of %s" % k) for k in oreg_genes)) - self.addGroupNamespacedNames(ns, zip(oreg_gids, ("regulatory_%s" % k for k in oreg_genes))) - - group_membership = [] - for i in range(len(oreg_gids)): - gid = oreg_gids[i] - gene_key = oreg_genes[i] - gene_member = set() - tfbs_member = {} - member_num = 2 - for oreg_id in oreganno_groups[gene_key]: - member_num += 1 - group_membership.append((gid, member_num, oreganno_types.get(oreg_id, 0), ns, oreg_id)) - for external_nsid, external_val in oreg_gene.get(oreg_id,{}).items(): - gene_member.add((gid, 1, typeids['gene'], external_nsid, external_val)) - - member_num += 1 - for external_nsid, external_val in oreg_tfbs.get(oreg_id,{}).items(): - tfbs_member.setdefault(external_nsid,{})[external_val] = member_num - - group_membership.extend(gene_member) - for ext_ns, d in tfbs_member.items(): - for sym, mn in d.items(): 
- group_membership.append((gid, mn, typeids['gene'], ext_ns, sym)) - - self.addGroupMemberNames(group_membership) - - self.log("writing to database completed\n") - - # store source metadata - self.setSourceBuilds(None, 19) # TODO: check for latest FTP path rather than hardcoded /goldenPath/hg19/database/ - #update() - -#Source_oreganno + + _remHost = "hgdownload.cse.ucsc.edu" + _remPath = "/goldenPath/hg19/database/" + + _remFiles = ["oreganno.txt.gz", "oregannoAttr.txt.gz", "oregannoLink.txt.gz"] + + @classmethod + def getVersionString(cls): + return "2.1 (2016-09-19)" + + # getVersionString() + + def download(self, options, path): + """ + Download OregAnno from UCSC + """ + # self.downloadFilesFromFTP(self._remHost, dict(((f, self._remPath + f) for f in self._remFiles))) + self.downloadFilesFromHTTP( + self._remHost, + dict(((path + "/" + f, self._remPath + f) for f in self._remFiles)), + ) + + return [(path + "/" + f) for f in self._remFiles] + + def update(self, options, path): + """ + Update the database with the OregAnno data from ucsc + """ + + self.log("deleting old records from the database ...\n") + self.deleteAll() + self.log("deleting old records from the database completed\n") + + # Add the 'oreganno' namespace + ns = self.addNamespace("oreganno") + + # Add the ensembl and entrez namespaces + external_ns = self.addNamespaces( + [ + ("symbol", 0), + ("entrez_gid", 0), + ("ensembl_gid", 0), + ] + ) + + # Add the types of Regions + typeids = self.addTypes([("regulatory_region",), ("tfbs",), ("gene",)]) + subtypeID = self.addSubtypes( + [ + ("-",), + ] + ) + + # Add the types of groups + group_typeid = self.addType("regulatory_group") + + # Add the role for regulatory + snp_roleid = self.addRole( + "regulatory", "OregAnno Regulatory Polymorphism", 1, 0 + ) + + # Get the default population ID + ldprofile_id = self.addLDProfile("", "no LD adjustment") + + # build dict of gene id->oreganno id and a dict of + # oreganno id->entrez id and oreganno id->ensembl id + oreg_gene = {} + oreg_tfbs = {} + oreg_snp = {} + link_f = self.zfile(path + "/oregannoLink.txt.gz") + entrez_ns = external_ns["entrez_gid"] + ensembl_ns = external_ns["ensembl_gid"] + symbol_ns = external_ns["symbol"] + self.log("parsing external links ...\n") + for l in link_f: + fields = l.split() + if fields[1] == "Gene": + oreg_id = fields[0] + if fields[2] in ("EnsemblGene", "EnsemblId"): + gene_id = fields[3].split(",")[ + -1 + ] # used to be "Homo_sapiens,ENSG123" but now just "ENSG123" + oreg_gene.setdefault(oreg_id, {})[ensembl_ns] = gene_id + elif fields[2] in ("EntrezGene", "NCBIGene"): + gene_id = fields[3] + oreg_gene.setdefault(oreg_id, {})[entrez_ns] = gene_id + elif fields[1] == "TFbs": + oreg_id = fields[0] + if fields[2] in ("EnsemblGene", "EnsemblId"): + gene_id = fields[3].split(",")[ + -1 + ] # used to be "Homo_sapiens,ENSG123" but now just "ENSG123" + oreg_tfbs.setdefault(oreg_id, {})[ensembl_ns] = gene_id + elif fields[2] in ("EntrezGene", "NCBIGene"): + gene_id = fields[3] + oreg_tfbs.setdefault(oreg_id, {})[entrez_ns] = gene_id + elif fields[1] == "ExtLink" and fields[2] == "dbSNP": + # Just store the RS# (no leading "rs") + oreg_snp[fields[0]] = fields[3][2:] + # for l + self.log( + "parsing external links completed: %d genes, %d TFBs, %d SNPs\n" + % (len(oreg_gene), len(oreg_tfbs), len(oreg_snp)) + ) + + # Now, create a dict of oreganno id->type + oreganno_type = {} + self.log("parsing region attributes ...\n") + attr_f = self.zfile(path + "/oregannoAttr.txt.gz") + for l in attr_f: + fields = 
l.split("\t") + if fields[1] == "type": + oreganno_type[fields[0]] = fields[2] + elif fields[1] == "Gene": + oreg_gene.setdefault(fields[0], {})[symbol_ns] = fields[2] + elif fields[1] == "TFbs": + oreg_tfbs.setdefault(fields[0], {})[symbol_ns] = fields[2] + # for l + self.log( + "parsing region attributes completed: %d genes, %d TFBs\n" + % (len(oreg_gene), len(oreg_tfbs)) + ) + + # OK, now parse the actual regions themselves + region_f = self.zfile(path + "/oreganno.txt.gz") + oreganno_roles = [] + oreganno_regions = [] + oreganno_bounds = [] + oreganno_groups = {} + oreganno_types = {} + self.log("parsing regulatory regions ...\n") + snps_unmapped = 0 + for l in region_f: + fields = l.split() + chrom = self._loki.chr_num.get(fields[1][3:]) + start = int(fields[2]) + 1 + stop = int(fields[3]) + oreg_id = fields[4] + oreg_type = oreganno_type[ + oreg_id + ].upper() # used to be CAPS, now Title Case + if chrom and oreg_type == "REGULATORY POLYMORPHISM": + entrez_id = oreg_gene.get(oreg_id, {}).get(entrez_ns) + rsid = oreg_snp.get(oreg_id) + if entrez_id and rsid: + oreganno_roles.append((int(rsid), entrez_id, snp_roleid)) + else: + snps_unmapped += 1 + elif chrom and ( + oreg_type == "REGULATORY REGION" + or oreg_type == "TRANSCRIPTION FACTOR BINDING SITE" + ): + gene_symbol = oreg_gene.get(oreg_id, {}).get(symbol_ns) + if not gene_symbol: + gene_symbol = oreg_tfbs.get(oreg_id, {}).get(symbol_ns) + + if gene_symbol: + oreganno_groups.setdefault(gene_symbol, []).append(oreg_id) + + if oreg_type == "REGULATORY REGION": + oreg_typeid = typeids["regulatory_region"] + else: + oreg_typeid = typeids["tfbs"] + + oreganno_types[oreg_id] = oreg_typeid + oreganno_regions.append((oreg_typeid, oreg_id, "")) + oreganno_bounds.append((chrom, start, stop)) + # if chrom and oreg_type + # for l + self.log( + "parsing regulatory regions completed (%d regions found, %d SNPs found, %d SNPs unmapped)\n" + % (len(oreganno_regions), len(oreganno_roles), snps_unmapped) + ) + + self.log("writing to database ...\n") + self.addSNPEntrezRoles(oreganno_roles) + reg_ids = self.addBiopolymers(oreganno_regions) + self.addBiopolymerNamespacedNames( + ns, ((reg_ids[i], oreganno_regions[i][1]) for i in range(len(reg_ids))) + ) + bound_gen = zip(((r,) for r in reg_ids), oreganno_bounds) + self.addBiopolymerLDProfileRegions( + ldprofile_id, ((itertools.chain(*c) for c in bound_gen)) + ) + + # Now, add the regulation groups + oreg_genes = list(oreganno_groups.keys()) + oreg_gids = self.addTypedGroups( + group_typeid, + ( + (subtypeID["-"], "regulatory_%s" % k, "OregAnno Regulation of %s" % k) + for k in oreg_genes + ), + ) + self.addGroupNamespacedNames( + ns, zip(oreg_gids, ("regulatory_%s" % k for k in oreg_genes)) + ) + + group_membership = [] + for i in range(len(oreg_gids)): + gid = oreg_gids[i] + gene_key = oreg_genes[i] + gene_member = set() + tfbs_member = {} + member_num = 2 + for oreg_id in oreganno_groups[gene_key]: + member_num += 1 + group_membership.append( + (gid, member_num, oreganno_types.get(oreg_id, 0), ns, oreg_id) + ) + for external_nsid, external_val in oreg_gene.get(oreg_id, {}).items(): + gene_member.add( + (gid, 1, typeids["gene"], external_nsid, external_val) + ) + + member_num += 1 + for external_nsid, external_val in oreg_tfbs.get(oreg_id, {}).items(): + tfbs_member.setdefault(external_nsid, {})[external_val] = member_num + + group_membership.extend(gene_member) + for ext_ns, d in tfbs_member.items(): + for sym, mn in d.items(): + group_membership.append((gid, mn, typeids["gene"], ext_ns, sym)) + + 
self.addGroupMemberNames(group_membership) + + self.log("writing to database completed\n") + + # store source metadata + self.setSourceBuilds( + None, 19 + ) # TODO: check for latest FTP path rather than hardcoded /goldenPath/hg19/database/ + + # update() + + +# Source_oreganno diff --git a/loki/loaders/loki_source_pfam.py b/loki/loaders/loki_source_pfam.py index 0ac96b2..6a481dd 100644 --- a/loki/loaders/loki_source_pfam.py +++ b/loki/loaders/loki_source_pfam.py @@ -5,173 +5,222 @@ class Source_pfam(loki_source.Source): - - - @classmethod - def getVersionString(cls): - return '2.2 (2016-02-08)' - #getVersionString() - - - def download(self, options, path): - # download the latest source files -# self.downloadFilesFromFTP('ftp.ebi.ac.uk', { -# 'pfamA.txt.gz': '/pub/databases/Pfam/current_release/database_files/pfamA.txt.gz', -# 'pfamA_reg_full_significant.txt.gz': '/pub/databases/Pfam/current_release/database_files/pfamA_reg_full_significant.txt.gz', -# 'pfamseq.txt.gz': '/pub/databases/Pfam/current_release/database_files/pfamseq.txt.gz', -# }) - self.downloadFilesFromHTTP('ftp.ebi.ac.uk', { - path+'/pfamA.txt.gz': '/pub/databases/Pfam/current_release/database_files/pfamA.txt.gz', - path+'/pfamA_reg_full_significant.txt.gz': '/pub/databases/Pfam/current_release/database_files/pfamA_reg_full_significant.txt.gz', - path+'/pfamseq.txt.gz': '/pub/databases/Pfam/current_release/database_files/pfamseq.txt.gz', - }) - - return [ - path+'/pfamA.txt.gz', - path+'/pfamA_reg_full_significant.txt.gz', - path+'/pfamseq.txt.gz' - ] - #download() - - - def update(self, options, path): - # clear out all old data from this source - self.log("deleting old records from the database ...\n") - self.deleteAll() - self.log("deleting old records from the database completed\n") - - # get or create the required metadata records - namespaceID = self.addNamespaces([ - ('pfam_id', 0), - ('proteinfamily', 0), - ('uniprot_pid', 1), - ]) - relationshipID = self.addRelationships([ - ('',), - ]) - typeID = self.addTypes([ - ('proteinfamily',), - ('gene',), - ]) - subtypeID = self.addSubtypes([ - ('-',), - ]) - - # process protein families - self.log("processing protein families ...\n") - pfamFile = self.zfile(path+'/pfamA.txt.gz') #TODO:context manager,iterator - groupFam = collections.defaultdict(set) - famAcc = {} - famID = {} - famName = {} - famDesc = {} - for line in pfamFile: - words = line.split("\t",10) - pfamNum = words[0].strip() - if pfamNum.isdigit(): - pfamNum = int(pfamNum) # auto_pfamA = 1 , 2 , ... - pfamAcc = words[1].strip() # pfamA_acc = PF00389 , PF00198 , ... - pfamID = words[2].strip() # pfamA_id = 2-Hacid_dh , 2-oxoacid_dh , ... - name = words[4].strip() # description = D-isomer specific 2-hydroxyacid dehydrogenase, catalytic domain , ... - group = words[8].strip() # type = Domain , Family , Motif , Repeat - desc = words[9].strip() # comment = (long description) - else: - # starting in release 28, all the "auto" columns were dropped - pfamAcc = pfamNum - pfamID = words[1].strip() # 2-Hacid_dh , 2-oxoacid_dh , ... - name = words[3].strip() # D-isomer specific 2-hydroxyacid dehydrogenase, catalytic domain , ... 
- group = words[7].strip() # Domain , Family , Motif , Repeat - desc = words[8].strip() # (long description) - - groupFam[group].add(pfamNum) - famAcc[pfamNum] = pfamAcc - famID[pfamNum] = pfamID - famName[pfamNum] = name - famDesc[pfamNum] = desc - numGroup = len(groupFam) - numFam = len(famName) - self.log("processing protein families completed: %d categories, %d families\n" % (numGroup,numFam)) - - # store protein families - self.log("writing protein families to the database ...\n") - listGroup = groupFam.keys() - listGID = self.addTypedGroups(typeID['proteinfamily'], ((subtypeID['-'], group,"") for group in listGroup)) - groupGID = dict(zip(listGroup,listGID)) - listFam = famAcc.keys() - listGID = self.addTypedGroups(typeID['proteinfamily'], ((subtypeID['-'], famName[fam],famDesc[fam]) for fam in listFam)) - famGID = dict(zip(listFam,listGID)) - self.log("writing protein families to the database completed\n") - - # store protein family names - self.log("writing protein family names to the database ...\n") - self.addGroupNamespacedNames(namespaceID['pfam_id'], ((groupGID[group],group) for group in listGroup)) - self.addGroupNamespacedNames(namespaceID['pfam_id'], ((famGID[fam],famAcc[fam]) for fam in listFam)) - self.addGroupNamespacedNames(namespaceID['proteinfamily'], ((famGID[fam],famID[fam]) for fam in listFam)) - self.addGroupNamespacedNames(namespaceID['proteinfamily'], ((famGID[fam],famName[fam]) for fam in listFam)) - famName = famDesc = None - self.log("writing protein family names to the database completed\n") - - # store protein family meta-group links - self.log("writing protein family links to the database ...\n") - for group in groupFam: - self.addGroupRelationships( (famGID[fam],groupGID[group],relationshipID[''],None) for fam in groupFam[group] ) - groupFam = None - self.log("writing protein family links to the database completed\n") - - # process protein identifiers - self.log("processing protein identifiers ...\n") - seqFile = self.zfile(path+'/pfamseq.txt.gz') #TODO:context manager,iterator - proNames = dict() - for line in seqFile: - words = line.split("\t",10) - proteinNum = words[0].strip() - if proteinNum.isdigit(): - proteinNum = int(proteinNum) # auto_pfamseq = 1 , 2 , ... - uniprotID = words[1] # pfamseq_id = 1433B_HUMAN , GATC_HUMAN , ... - uniprotAcc = words[2] # pfamseq_acc = P31946 , O43716 , ... - species = words[9] # species = Homo sapiens (Human) - else: - # starting in release 28, all the "auto" columns were dropped - uniprotID = proteinNum # pfamseq_id = 1433B_HUMAN , GATC_HUMAN , ... - uniprotAcc = words[1] # pfamseq_acc = P31946 , O43716 , ... 
- species = words[8] # species = Homo sapiens (Human) - - if species == 'Homo sapiens (Human)': - proNames[proteinNum] = (uniprotID,uniprotAcc) - #foreach protein - self.log("processing protein identifiers completed: %d proteins\n" % (len(proNames),)) - - # process associations - self.log("processing protein associations ...\n") - assocFile = self.zfile(path+'/pfamA_reg_full_significant.txt.gz') #TODO:context manager,iterator - setAssoc = set() - numAssoc = numID = 0 - for line in assocFile: - words = line.split("\t",15) - pfamNum = words[1].strip() - if pfamNum.isdigit(): - pfamNum = int(pfamNum) # auto_pfamA - proteinNum = int(words[2]) # auto_pfamseq - inFull = int(words[14]) # in_full - else: - # starting in release 28, all the "auto" columns were dropped - pfamNum = pfamNum # pfamA_acc - proteinNum = words[2].strip() # pfamseq_acc - inFull = int(words[14]) # in_full - - if (pfamNum in famGID) and (proteinNum in proNames) and inFull: - numAssoc += 1 - numID += len(proNames[proteinNum]) - for name in proNames[proteinNum]: - setAssoc.add( (famGID[pfamNum],numAssoc,name) ) - #if association is ok - #foreach association - self.log("processing protein associations completed: %d associations (%d identifiers)\n" % (numAssoc,numID)) - - # store gene associations - self.log("writing gene associations to the database ...\n") - self.addGroupMemberTypedNamespacedNames(typeID['gene'], namespaceID['uniprot_pid'], setAssoc) - self.log("writing gene associations to the database completed\n") - #update() - -#Source_pfam + + @classmethod + def getVersionString(cls): + return "2.2 (2016-02-08)" + + # getVersionString() + + def download(self, options, path): + # download the latest source files + # self.downloadFilesFromFTP('ftp.ebi.ac.uk', { + # 'pfamA.txt.gz': '/pub/databases/Pfam/current_release/database_files/pfamA.txt.gz', + # 'pfamA_reg_full_significant.txt.gz': '/pub/databases/Pfam/current_release/database_files/pfamA_reg_full_significant.txt.gz', + # 'pfamseq.txt.gz': '/pub/databases/Pfam/current_release/database_files/pfamseq.txt.gz', + # }) + self.downloadFilesFromHTTP( + "ftp.ebi.ac.uk", + { + path + + "/pfamA.txt.gz": "/pub/databases/Pfam/current_release/database_files/pfamA.txt.gz", + path + + "/pfamA_reg_full_significant.txt.gz": "/pub/databases/Pfam/current_release/database_files/pfamA_reg_full_significant.txt.gz", + path + + "/pfamseq.txt.gz": "/pub/databases/Pfam/current_release/database_files/pfamseq.txt.gz", + }, + ) + + return [ + path + "/pfamA.txt.gz", + path + "/pfamA_reg_full_significant.txt.gz", + path + "/pfamseq.txt.gz", + ] + + # download() + + def update(self, options, path): + # clear out all old data from this source + self.log("deleting old records from the database ...\n") + self.deleteAll() + self.log("deleting old records from the database completed\n") + + # get or create the required metadata records + namespaceID = self.addNamespaces( + [ + ("pfam_id", 0), + ("proteinfamily", 0), + ("uniprot_pid", 1), + ] + ) + relationshipID = self.addRelationships( + [ + ("",), + ] + ) + typeID = self.addTypes( + [ + ("proteinfamily",), + ("gene",), + ] + ) + subtypeID = self.addSubtypes( + [ + ("-",), + ] + ) + + # process protein families + self.log("processing protein families ...\n") + pfamFile = self.zfile(path + "/pfamA.txt.gz") # TODO:context manager,iterator + groupFam = collections.defaultdict(set) + famAcc = {} + famID = {} + famName = {} + famDesc = {} + for line in pfamFile: + words = line.split("\t", 10) + pfamNum = words[0].strip() + if pfamNum.isdigit(): + pfamNum = 
int(pfamNum) # auto_pfamA = 1 , 2 , ... + pfamAcc = words[1].strip() # pfamA_acc = PF00389 , PF00198 , ... + pfamID = words[2].strip() # pfamA_id = 2-Hacid_dh , 2-oxoacid_dh , ... + name = words[ + 4 + ].strip() # description = D-isomer specific 2-hydroxyacid dehydrogenase, catalytic domain , ... + group = words[8].strip() # type = Domain , Family , Motif , Repeat + desc = words[9].strip() # comment = (long description) + else: + # starting in release 28, all the "auto" columns were dropped + pfamAcc = pfamNum + pfamID = words[1].strip() # 2-Hacid_dh , 2-oxoacid_dh , ... + name = words[ + 3 + ].strip() # D-isomer specific 2-hydroxyacid dehydrogenase, catalytic domain , ... + group = words[7].strip() # Domain , Family , Motif , Repeat + desc = words[8].strip() # (long description) + + groupFam[group].add(pfamNum) + famAcc[pfamNum] = pfamAcc + famID[pfamNum] = pfamID + famName[pfamNum] = name + famDesc[pfamNum] = desc + numGroup = len(groupFam) + numFam = len(famName) + self.log( + "processing protein families completed: %d categories, %d families\n" + % (numGroup, numFam) + ) + + # store protein families + self.log("writing protein families to the database ...\n") + listGroup = groupFam.keys() + listGID = self.addTypedGroups( + typeID["proteinfamily"], + ((subtypeID["-"], group, "") for group in listGroup), + ) + groupGID = dict(zip(listGroup, listGID)) + listFam = famAcc.keys() + listGID = self.addTypedGroups( + typeID["proteinfamily"], + ((subtypeID["-"], famName[fam], famDesc[fam]) for fam in listFam), + ) + famGID = dict(zip(listFam, listGID)) + self.log("writing protein families to the database completed\n") + + # store protein family names + self.log("writing protein family names to the database ...\n") + self.addGroupNamespacedNames( + namespaceID["pfam_id"], ((groupGID[group], group) for group in listGroup) + ) + self.addGroupNamespacedNames( + namespaceID["pfam_id"], ((famGID[fam], famAcc[fam]) for fam in listFam) + ) + self.addGroupNamespacedNames( + namespaceID["proteinfamily"], ((famGID[fam], famID[fam]) for fam in listFam) + ) + self.addGroupNamespacedNames( + namespaceID["proteinfamily"], + ((famGID[fam], famName[fam]) for fam in listFam), + ) + famName = famDesc = None + self.log("writing protein family names to the database completed\n") + + # store protein family meta-group links + self.log("writing protein family links to the database ...\n") + for group in groupFam: + self.addGroupRelationships( + (famGID[fam], groupGID[group], relationshipID[""], None) + for fam in groupFam[group] + ) + groupFam = None + self.log("writing protein family links to the database completed\n") + + # process protein identifiers + self.log("processing protein identifiers ...\n") + seqFile = self.zfile(path + "/pfamseq.txt.gz") # TODO:context manager,iterator + proNames = dict() + for line in seqFile: + words = line.split("\t", 10) + proteinNum = words[0].strip() + if proteinNum.isdigit(): + proteinNum = int(proteinNum) # auto_pfamseq = 1 , 2 , ... + uniprotID = words[1] # pfamseq_id = 1433B_HUMAN , GATC_HUMAN , ... + uniprotAcc = words[2] # pfamseq_acc = P31946 , O43716 , ... + species = words[9] # species = Homo sapiens (Human) + else: + # starting in release 28, all the "auto" columns were dropped + uniprotID = proteinNum # pfamseq_id = 1433B_HUMAN , GATC_HUMAN , ... + uniprotAcc = words[1] # pfamseq_acc = P31946 , O43716 , ... 
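+ # note: with the auto_pfamseq column gone, every field shifts left by one, so species is words[8] here instead of words[9] above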
+ species = words[8] # species = Homo sapiens (Human) + + if species == "Homo sapiens (Human)": + proNames[proteinNum] = (uniprotID, uniprotAcc) + # foreach protein + self.log( + "processing protein identifiers completed: %d proteins\n" % (len(proNames),) + ) + + # process associations + self.log("processing protein associations ...\n") + assocFile = self.zfile( + path + "/pfamA_reg_full_significant.txt.gz" + ) # TODO:context manager,iterator + setAssoc = set() + numAssoc = numID = 0 + for line in assocFile: + words = line.split("\t", 15) + pfamNum = words[1].strip() + if pfamNum.isdigit(): + pfamNum = int(pfamNum) # auto_pfamA + proteinNum = int(words[2]) # auto_pfamseq + inFull = int(words[14]) # in_full + else: + # starting in release 28, all the "auto" columns were dropped + pfamNum = pfamNum # pfamA_acc + proteinNum = words[2].strip() # pfamseq_acc + inFull = int(words[14]) # in_full + + if (pfamNum in famGID) and (proteinNum in proNames) and inFull: + numAssoc += 1 + numID += len(proNames[proteinNum]) + for name in proNames[proteinNum]: + setAssoc.add((famGID[pfamNum], numAssoc, name)) + # if association is ok + # foreach association + self.log( + "processing protein associations completed: %d associations (%d identifiers)\n" + % (numAssoc, numID) + ) + + # store gene associations + self.log("writing gene associations to the database ...\n") + self.addGroupMemberTypedNamespacedNames( + typeID["gene"], namespaceID["uniprot_pid"], setAssoc + ) + self.log("writing gene associations to the database completed\n") + + # update() + + +# Source_pfam diff --git a/loki/loaders/loki_source_pharmgkb.py b/loki/loaders/loki_source_pharmgkb.py index f10f3e5..ece945e 100644 --- a/loki/loaders/loki_source_pharmgkb.py +++ b/loki/loaders/loki_source_pharmgkb.py @@ -5,235 +5,297 @@ class Source_pharmgkb(loki_source.Source): - - - @classmethod - def getVersionString(cls): - return '2.3 (2018-10-30)' - #getVersionString() - - - def download(self, options, path): - self.downloadFilesFromHTTPS('api.pharmgkb.org', { - path+'/genes.zip': '/v1/download/file/data/genes.zip', - path+'/pathways-tsv.zip': '/v1/download/file/data/pathways-tsv.zip', - }) - - return [ - path+'/genes.zip', - path+'/pathways-tsv.zip' - ] - #download() - - - def update(self, options, path): - # clear out all old data from this source - self.log("deleting old records from the database ...\n") - self.deleteAll() - self.log("deleting old records from the database completed\n") - - # get or create the required metadata records - namespaceID = self.addNamespaces([ - ('pharmgkb_id', 0), - ('pathway', 0), - ('pharmgkb_gid', 0), - ('symbol', 0), - ('entrez_gid', 0), - ('refseq_gid', 0), - ('refseq_pid', 1), - ('ensembl_gid', 0), - ('ensembl_pid', 1), - ('hgnc_id', 0), - ('uniprot_gid', 0), - ('uniprot_pid', 1), - ]) - typeID = self.addTypes([ - ('gene',), - ('pathway',), - ]) - subtypeID = self.addSubtypes([ - ('-',), - ]) - - # process gene names - self.log("verifying gene name archive file ...\n") - setNames = set() - empty = tuple() - with zipfile.ZipFile(path+'/genes.zip','r') as geneZip: - err = geneZip.testzip() - if err: - self.log(" ERROR\n") - self.log("CRC failed for %s\n" % err) - return False - self.log("verifying gene name archive file completed\n") - self.log("processing gene names ...\n") - xrefNS = { - 'entrezGene': ('entrez_gid',), - 'refSeqDna': ('refseq_gid',), - 'refSeqRna': ('refseq_gid',), - 'refSeqProtein': ('refseq_pid',), - 'ensembl': ('ensembl_gid','ensembl_pid'), - 'hgnc': ('hgnc_id',), - 'uniProtKb': 
('uniprot_gid','uniprot_pid'), - } - for info in geneZip.infolist(): - if info.filename == 'genes.tsv': - geneFile = geneZip.open(info,'r') - header = geneFile.__next__().rstrip() - if header.decode().startswith("PharmGKB Accession Id Entrez Id Ensembl Id Name Symbol Alternate Names Alternate Symbols Is VIP Has Variant Annotation Cross-references"): - new2 = 0 - elif header.decode().startswith("PharmGKB Accession Id NCBI Gene ID HGNC ID Ensembl Id Name Symbol Alternate Names Alternate Symbols Is VIP Has Variant Annotation Cross-references"): - new2 = 1 - else: - raise Exception("ERROR: unrecognized file header in '%s': %s" % (info.filename,header)) - for line in geneFile: - words = line.decode('latin-1').split("\t") - pgkbID = words[0] - entrezID = words[1] - ensemblID = words[2+new2] - symbol = words[4+new2] - aliases = words[6+new2].split(',') if words[6+new2] != "" else empty - xrefs = words[9+new2].strip(', \r\n').split(',') if words[9+new2] != "" else empty - - if entrezID: - setNames.add( (namespaceID['entrez_gid'],entrezID,pgkbID) ) - if ensemblID: - setNames.add( (namespaceID['ensembl_gid'],ensemblID,pgkbID) ) - setNames.add( (namespaceID['ensembl_pid'],ensemblID,pgkbID) ) - if symbol: - setNames.add( (namespaceID['symbol'],symbol,pgkbID) ) - for alias in aliases: - #line.decode('latin-1') should handle this above - #setNames.add( (namespaceID['symbol'],unicode(alias.strip('" '),errors='ignore'),pgkbID) ) - setNames.add( (namespaceID['symbol'],alias.strip('" '),pgkbID) ) - for xref in xrefs: - try: - xrefDB,xrefID = xref.split(':',1) - if xrefDB in xrefNS: - for ns in xrefNS[xrefDB]: - setNames.add( (namespaceID[ns],xrefID,pgkbID) ) - #line.decode('latin-1') should handle this above - #try: - # xrefID.encode('ascii') - # setNames.add( (namespaceID[ns],xrefID.decode('utf8').encode('ascii'),pgkbID) ) - #except: - # self.log("Cannot encode gene alias") - except ValueError: - pass - #foreach line in geneFile - geneFile.close() - #if genes.tsv - #foreach file in geneZip - #with geneZip - numIDs = len(set(n[2] for n in setNames)) - self.log("processing gene names completed: %d identifiers (%d references)\n" % (numIDs,len(setNames))) - - # store gene names - self.log("writing gene names to the database ...\n") - self.addBiopolymerTypedNameNamespacedNames(typeID['gene'], namespaceID['pharmgkb_gid'], setNames) - self.log("writing gene names to the database completed\n") - setNames = None - - # process pathways - self.log("verifying pathway archive file ...\n") - pathDesc = {} - nsAssoc = { - 'pharmgkb_gid': set(), - 'symbol': set(), - } - numAssoc = numID = 0 - with zipfile.ZipFile(path+'/pathways-tsv.zip','r') as pathZip: - err = pathZip.testzip() - if err: - self.log(" ERROR\n") - self.log("CRC failed for %s\n" % err) - return False - self.log("verifying pathway archive file completed\n") - self.log("processing pathways ...\n") - for info in pathZip.infolist(): - if info.filename == 'pathways.tsv': - # the old format had all pathways in one giant file, delimited by blank lines - pathFile = pathZip.open(path+'/'+info,'r') - curPath = None - lastline = "" - for line in pathFile: - line = line.decode('latin-1').rstrip("\r\n") - if line == "" and lastline == "": - curPath = None - elif curPath is None: - words = line.split(':',1) - if len(words) >= 2: - curPath = words[0].strip() - desc = words[1].strip().rsplit(' - ',1) - desc.append('') - #line.decode('latin-1') should handle this above - #pathDesc[curPath] = 
(unicode(desc[0].strip(),errors='ignore'),unicode(desc[1].strip(),errors='ignore')) - pathDesc[curPath] = (desc[0].strip().replace("`", "'"),desc[1].strip().replace("`", "'")) - elif curPath is False: - pass - else: - words = line.split("\t") - if words[0] == "From": - curPath = False - elif words[0] == "Gene": - pgkbID = words[1] - symbol = words[2] - - numAssoc += 1 - numID += 2 - nsAssoc['pharmgkb_gid'].add( (curPath,numAssoc,pgkbID) ) - nsAssoc['symbol'].add( (curPath,numAssoc,symbol) ) - #if assoc is Gene - lastline = line - #foreach line in pathFile - pathFile.close() - elif info.filename.endswith('.tsv'): - # the new format has separate "PA###-***.tsv" files for each pathway - pathFile = pathZip.open(info,'r') - header = next(pathFile) - if header.decode().startswith("From To Reaction Type Controller Control Type Cell Type PubMed Id Genes"): # Drugs Diseases - pass - elif header.decode().startswith("From To Reaction Type Controller Control Type Cell Type PMIDs Genes"): # Drugs Diseases - pass - else: - raise Exception("ERROR: unrecognized file header in '%s': %s" % (info.filename,header)) - parts = info.filename.split('-') - curPath = parts[0] - parts = parts[1].split('.') - pathDesc[curPath] = (subtypeID['-'], parts[0].replace("_"," "),None) - for line in pathFile: - for symbol in line.decode('latin-1').split("\t")[7].split(","): - numAssoc += 1 - numID += 1 - nsAssoc['symbol'].add( (curPath,numAssoc,symbol.strip('"')) ) - #foreach line in pathFile - pathFile.close() - #if pathways.tsv - #foreach file in pathZip - #with pathZip - self.log("processing pathways completed: %d pathways, %d associations (%d identifiers)\n" % (len(pathDesc),numAssoc,numID)) - - # store pathways - self.log("writing pathways to the database ...\n") - listPath = pathDesc.keys() - listGID = self.addTypedGroups(typeID['pathway'], (pathDesc[path] for path in listPath)) - pathGID = dict(zip(listPath,listGID)) - self.log("writing pathways to the database completed\n") - - # store pathway names - self.log("writing pathway names to the database ...\n") - self.addGroupNamespacedNames(namespaceID['pharmgkb_id'], ((pathGID[path],path) for path in listPath)) - self.addGroupNamespacedNames(namespaceID['pathway'], ((pathGID[path],pathDesc[path][0]) for path in listPath)) - self.log("writing pathway names to the database completed\n") - - # store gene associations - self.log("writing gene associations to the database ...\n") - for ns in nsAssoc: - self.addGroupMemberTypedNamespacedNames(typeID['gene'], namespaceID[ns], ((pathGID[a[0]],a[1],a[2]) for a in nsAssoc[ns]) ) - self.log("writing gene associations to the database completed\n") - - #TODO: eventually add diseases, drugs, relationships - - #update() - -#Source_pharmgkb + + @classmethod + def getVersionString(cls): + return "2.3 (2018-10-30)" + + # getVersionString() + + def download(self, options, path): + self.downloadFilesFromHTTPS( + "api.pharmgkb.org", + { + path + "/genes.zip": "/v1/download/file/data/genes.zip", + path + "/pathways-tsv.zip": "/v1/download/file/data/pathways-tsv.zip", + }, + ) + + return [path + "/genes.zip", path + "/pathways-tsv.zip"] + + # download() + + def update(self, options, path): + # clear out all old data from this source + self.log("deleting old records from the database ...\n") + self.deleteAll() + self.log("deleting old records from the database completed\n") + + # get or create the required metadata records + namespaceID = self.addNamespaces( + [ + ("pharmgkb_id", 0), + ("pathway", 0), + ("pharmgkb_gid", 0), + ("symbol", 0), + 
("entrez_gid", 0), + ("refseq_gid", 0), + ("refseq_pid", 1), + ("ensembl_gid", 0), + ("ensembl_pid", 1), + ("hgnc_id", 0), + ("uniprot_gid", 0), + ("uniprot_pid", 1), + ] + ) + typeID = self.addTypes( + [ + ("gene",), + ("pathway",), + ] + ) + subtypeID = self.addSubtypes( + [ + ("-",), + ] + ) + + # process gene names + self.log("verifying gene name archive file ...\n") + setNames = set() + empty = tuple() + with zipfile.ZipFile(path + "/genes.zip", "r") as geneZip: + err = geneZip.testzip() + if err: + self.log(" ERROR\n") + self.log("CRC failed for %s\n" % err) + return False + self.log("verifying gene name archive file completed\n") + self.log("processing gene names ...\n") + xrefNS = { + "entrezGene": ("entrez_gid",), + "refSeqDna": ("refseq_gid",), + "refSeqRna": ("refseq_gid",), + "refSeqProtein": ("refseq_pid",), + "ensembl": ("ensembl_gid", "ensembl_pid"), + "hgnc": ("hgnc_id",), + "uniProtKb": ("uniprot_gid", "uniprot_pid"), + } + for info in geneZip.infolist(): + if info.filename == "genes.tsv": + geneFile = geneZip.open(info, "r") + header = geneFile.__next__().rstrip() + if header.decode().startswith( + "PharmGKB Accession Id Entrez Id Ensembl Id Name Symbol Alternate Names Alternate Symbols Is VIP Has Variant Annotation Cross-references" + ): + new2 = 0 + elif header.decode().startswith( + "PharmGKB Accession Id NCBI Gene ID HGNC ID Ensembl Id Name Symbol Alternate Names Alternate Symbols Is VIP Has Variant Annotation Cross-references" + ): + new2 = 1 + else: + raise Exception( + "ERROR: unrecognized file header in '%s': %s" + % (info.filename, header) + ) + for line in geneFile: + words = line.decode("latin-1").split("\t") + pgkbID = words[0] + entrezID = words[1] + ensemblID = words[2 + new2] + symbol = words[4 + new2] + aliases = ( + words[6 + new2].split(",") + if words[6 + new2] != "" + else empty + ) + xrefs = ( + words[9 + new2].strip(", \r\n").split(",") + if words[9 + new2] != "" + else empty + ) + + if entrezID: + setNames.add((namespaceID["entrez_gid"], entrezID, pgkbID)) + if ensemblID: + setNames.add( + (namespaceID["ensembl_gid"], ensemblID, pgkbID) + ) + setNames.add( + (namespaceID["ensembl_pid"], ensemblID, pgkbID) + ) + if symbol: + setNames.add((namespaceID["symbol"], symbol, pgkbID)) + for alias in aliases: + # line.decode('latin-1') should handle this above + # setNames.add( (namespaceID['symbol'],unicode(alias.strip('" '),errors='ignore'),pgkbID) ) + setNames.add( + (namespaceID["symbol"], alias.strip('" '), pgkbID) + ) + for xref in xrefs: + try: + xrefDB, xrefID = xref.split(":", 1) + if xrefDB in xrefNS: + for ns in xrefNS[xrefDB]: + setNames.add((namespaceID[ns], xrefID, pgkbID)) + # line.decode('latin-1') should handle this above + # try: + # xrefID.encode('ascii') + # setNames.add( (namespaceID[ns],xrefID.decode('utf8').encode('ascii'),pgkbID) ) + # except: + # self.log("Cannot encode gene alias") + except ValueError: + pass + # foreach line in geneFile + geneFile.close() + # if genes.tsv + # foreach file in geneZip + # with geneZip + numIDs = len(set(n[2] for n in setNames)) + self.log( + "processing gene names completed: %d identifiers (%d references)\n" + % (numIDs, len(setNames)) + ) + + # store gene names + self.log("writing gene names to the database ...\n") + self.addBiopolymerTypedNameNamespacedNames( + typeID["gene"], namespaceID["pharmgkb_gid"], setNames + ) + self.log("writing gene names to the database completed\n") + setNames = None + + # process pathways + self.log("verifying pathway archive file ...\n") + pathDesc = {} + nsAssoc = { 
+ "pharmgkb_gid": set(), + "symbol": set(), + } + numAssoc = numID = 0 + with zipfile.ZipFile(path + "/pathways-tsv.zip", "r") as pathZip: + err = pathZip.testzip() + if err: + self.log(" ERROR\n") + self.log("CRC failed for %s\n" % err) + return False + self.log("verifying pathway archive file completed\n") + self.log("processing pathways ...\n") + for info in pathZip.infolist(): + if info.filename == "pathways.tsv": + # the old format had all pathways in one giant file, delimited by blank lines + pathFile = pathZip.open(path + "/" + info, "r") + curPath = None + lastline = "" + for line in pathFile: + line = line.decode("latin-1").rstrip("\r\n") + if line == "" and lastline == "": + curPath = None + elif curPath is None: + words = line.split(":", 1) + if len(words) >= 2: + curPath = words[0].strip() + desc = words[1].strip().rsplit(" - ", 1) + desc.append("") + # line.decode('latin-1') should handle this above + # pathDesc[curPath] = (unicode(desc[0].strip(),errors='ignore'),unicode(desc[1].strip(),errors='ignore')) + pathDesc[curPath] = ( + desc[0].strip().replace("`", "'"), + desc[1].strip().replace("`", "'"), + ) + elif curPath is False: + pass + else: + words = line.split("\t") + if words[0] == "From": + curPath = False + elif words[0] == "Gene": + pgkbID = words[1] + symbol = words[2] + + numAssoc += 1 + numID += 2 + nsAssoc["pharmgkb_gid"].add((curPath, numAssoc, pgkbID)) + nsAssoc["symbol"].add((curPath, numAssoc, symbol)) + # if assoc is Gene + lastline = line + # foreach line in pathFile + pathFile.close() + elif info.filename.endswith(".tsv"): + # the new format has separate "PA###-***.tsv" files for each pathway + pathFile = pathZip.open(info, "r") + header = next(pathFile) + if header.decode().startswith( + "From To Reaction Type Controller Control Type Cell Type PubMed Id Genes" + ): # Drugs Diseases + pass + elif header.decode().startswith( + "From To Reaction Type Controller Control Type Cell Type PMIDs Genes" + ): # Drugs Diseases + pass + else: + raise Exception( + "ERROR: unrecognized file header in '%s': %s" + % (info.filename, header) + ) + parts = info.filename.split("-") + curPath = parts[0] + parts = parts[1].split(".") + pathDesc[curPath] = ( + subtypeID["-"], + parts[0].replace("_", " "), + None, + ) + for line in pathFile: + for symbol in line.decode("latin-1").split("\t")[7].split(","): + numAssoc += 1 + numID += 1 + nsAssoc["symbol"].add( + (curPath, numAssoc, symbol.strip('"')) + ) + # foreach line in pathFile + pathFile.close() + # if pathways.tsv + # foreach file in pathZip + # with pathZip + self.log( + "processing pathways completed: %d pathways, %d associations (%d identifiers)\n" + % (len(pathDesc), numAssoc, numID) + ) + + # store pathways + self.log("writing pathways to the database ...\n") + listPath = pathDesc.keys() + listGID = self.addTypedGroups( + typeID["pathway"], (pathDesc[path] for path in listPath) + ) + pathGID = dict(zip(listPath, listGID)) + self.log("writing pathways to the database completed\n") + + # store pathway names + self.log("writing pathway names to the database ...\n") + self.addGroupNamespacedNames( + namespaceID["pharmgkb_id"], ((pathGID[path], path) for path in listPath) + ) + self.addGroupNamespacedNames( + namespaceID["pathway"], + ((pathGID[path], pathDesc[path][0]) for path in listPath), + ) + self.log("writing pathway names to the database completed\n") + + # store gene associations + self.log("writing gene associations to the database ...\n") + for ns in nsAssoc: + self.addGroupMemberTypedNamespacedNames( + 
typeID["gene"], + namespaceID[ns], + ((pathGID[a[0]], a[1], a[2]) for a in nsAssoc[ns]), + ) + self.log("writing gene associations to the database completed\n") + + # TODO: eventually add diseases, drugs, relationships + + # update() + + +# Source_pharmgkb diff --git a/loki/loaders/loki_source_reactome.py b/loki/loaders/loki_source_reactome.py index c63576a..c6a09bc 100644 --- a/loki/loaders/loki_source_reactome.py +++ b/loki/loaders/loki_source_reactome.py @@ -7,361 +7,489 @@ class Source_reactome(loki_source.Source): - - - @classmethod - def getVersionString(cls): - return '2.1 (2015-01-23)' - #getVersionString() - - - def download(self, options, path): - # download the latest source files - self.downloadFilesFromHTTP('www.reactome.org', { - path+'/ReactomePathways.txt' : '/download/current/ReactomePathways.txt', - path+'/ReactomePathwaysRelation.txt' : '/download/current/ReactomePathwaysRelation.txt', - path+'/ReactomePathways.gmt.zip' : '/download/current/ReactomePathways.gmt.zip', - path+'/UniProt2Reactome.txt' : '/download/current/UniProt2Reactome.txt', - path+'/Ensembl2Reactome.txt' : '/download/current/Ensembl2Reactome.txt', - # path+'/homo_sapiens.interactions.txt.gz' : '/download/current/homo_sapiens.interactions.txt.gz', - # path+'/gene_association.reactome' : '/download/current/gene_association.reactome', - }) - - return [ - path+'/ReactomePathways.txt', - path+'/ReactomePathwaysRelation.txt', - path+'/ReactomePathways.gmt.zip', - path+'/UniProt2Reactome.txt', - path+'/Ensembl2Reactome.txt' - ] - #download() - - - def update(self, options, path): - # clear out all old data from this source - self.log("deleting old records from the database ...\n") - self.deleteAll() - self.log("deleting old records from the database completed\n") - - # get or create the required metadata records - namespaceID = self.addNamespaces([ - ('symbol', 0), - ('entrez_gid', 0), - ('ensembl_gid', 0), - ('ensembl_pid', 1), - ('uniprot_pid', 1), - ('pathway', 0), - ('reactome_id', 0), - ]) - relationshipID = self.addRelationships([ - ('',), - ]) - typeID = self.addTypes([ - ('gene',), - ('pathway',), - ]) - subtypeID = self.addSubtypes([ - ('-',), - ]) - - # initialize storage - numPath = 0 - reactPath = dict() - pathReact = dict() - listRelationships = list() - numAssoc = 0 - nsAssoc = { - 'symbol' : { 'path':set(), 'react':set() }, - 'entrez_gid' : { 'path':set(), 'react':set() }, - 'ensembl_gid' : { 'path':set(), 'react':set() }, - 'ensembl_pid' : { 'path':set(), 'react':set() }, - 'uniprot_pid' : { 'path':set(), 'react':set() }, - } - - # process pathways - # \t\t - self.log("processing pathways ...\n") - numNewPath = 0 - numMismatch = 0 - with open(path+'/ReactomePathways.txt', 'r') as pathFile: - # no header - for line in pathFile: - words = line.rstrip().split("\t") - if line.startswith('#') or (len(words) < 3) or (words[2] != "Homo sapiens"): - continue - reactID = words[0] - pathway = words[1] - - if reactID not in reactPath: - numNewPath += 1 - reactPath[reactID] = pathway - pathReact[pathway] = reactID - elif reactPath[reactID] != pathway: - numMismatch += 1 - #for line in pathFile - #with pathFile - self.log("processing pathways completed: %d pathways (%d mismatches)\n" % (numNewPath,numMismatch)) - numPath += numNewPath - - # process pathway relationships - # \t - self.log("processing pathway hierarchy ...\n") - numRelations = 0 - with open(path+'/ReactomePathwaysRelation.txt', 'r') as relFile: - # no header - for line in relFile: - words = line.rstrip().split("\t") - if line.startswith('#') 
or (len(words) < 2): - continue - - numRelations += 1 - listRelationships.append( (words[0],words[1]) ) - #with relFile - self.log("processing pathway hierarchy completed: %d relationships\n" % (numRelations,)) - - # process gene sets - # \t"Reactome Pathway"\t\t... - self.log("verifying gene set archive ...\n") - numNewPath = 0 - numNewAssoc = 0 - with zipfile.ZipFile(path+'/ReactomePathways.gmt.zip','r') as geneZip: - err = geneZip.testzip() - if err: - self.log(" ERROR\n") - raise Exception("CRC failed for %s\n" % err) - self.log("verifying gene set archive completed\n") - self.log("processing gene sets ...\n") - for info in geneZip.infolist(): - # there should be only one file in the archive, but just in case.. - if info.filename == 'ReactomePathways.gmt': - geneFile = geneZip.open(info,'r') - for line in geneFile: - words = line.decode('latin-1').rstrip().split("\t") - if line.decode().startswith('#') or (len(words) < 3) or (words[1] != "Reactome Pathway"): - continue - pathway = words[0] - - if pathway not in pathReact: - numPath += 1 - numNewPath += 1 - reactID = "REACT_unknown_%d" % (numPath,) - pathReact[pathway] = reactID - reactPath[reactID] = pathway - - for n in range(2, len(words)): - numAssoc += 1 - numNewAssoc += 1 - nsAssoc['symbol']['path'].add( (pathway,numAssoc,words[n]) ) - #foreach gene symbol - #foreach line in geneFile - geneFile.close() - #if file ok - #foreach file in geneZip - self.log("processing gene sets completed: %d associations (%d new pathways)\n" % (numNewAssoc,numNewPath)) - #with geneZip - - # TODO: ChEBI or miRBase mappings? - - # process ensembl mappings (to lowest reactome pathway, not parents) - # http://www.reactome.org/download/mapping.README.txt - # \t\t\t\t\t - self.log("processing ensembl associations ...\n") - numNewPath = 0 - numMismatch = 0 - numNewAssoc = 0 - with open(path+'/Ensembl2Reactome.txt', 'r') as assocFile: - for line in assocFile: - words = line.rstrip().split("\t") - if line.startswith('#') or (len(words) < 6) or (words[5] != "Homo sapiens"): - continue - ensemblID = words[0] - reactID = words[1] - pathway = words[3] - - if ensemblID.startswith('ENSG'): - ns = 'ensembl_gid' - elif ensemblID.startswith('ENSP'): - ns = 'ensembl_pid' - else: - continue - - if reactID not in reactPath: - numPath += 1 - numNewPath += 1 - reactPath[reactID] = pathway - pathReact[pathway] = reactID - elif reactPath[reactID] != pathway: - numMismatch += 1 - continue - - numAssoc += 1 - numNewAssoc += 1 - nsAssoc[ns]['path'].add( (pathway,numAssoc,ensemblID) ) - #foreach line in assocFile - #with assocFile - self.log("processing ensembl associations completed: %d associations (%d new pathways, %d mismatches)\n" % (numNewAssoc,numNewPath,numMismatch)) - - # process uniprot mappings (to lowest reactome pathway, not parents) - # http://www.reactome.org/download/mapping.README.txt - # \t\t\t\t\t - self.log("processing uniprot associations ...\n") - numNewPath = 0 - numMismatch = 0 - numNewAssoc = 0 - with open(path+'/UniProt2Reactome.txt', 'r') as assocFile: - for line in assocFile: - words = line.rstrip().split("\t") - if line.startswith('#') or (len(words) < 6) or (words[5] != "Homo sapiens"): - continue - uniprotPID = words[0] - reactID = words[1] - pathway = words[3] - - if reactID not in reactPath: - numPath += 1 - numNewPath += 1 - reactPath[reactID] = pathway - pathReact[pathway] = reactID - elif reactPath[reactID] != pathway: - numMismatch += 1 - continue - - numAssoc += 1 - numNewAssoc += 1 - nsAssoc['uniprot_pid']['path'].add( 
(pathway,numAssoc,uniprotPID) ) - #foreach line in assocFile - #with assocFile - self.log("processing uniprot associations completed: %d associations (%d new pathways, %d mismatches)\n" % (numNewAssoc,numNewPath,numMismatch)) - numPath += numNewPath - numAssoc += numNewAssoc - - # TODO: process interaction associations? - - if False: - tally = collections.defaultdict(int) - # http://www.reactome.org/download/interactions.README.txt - # \t\t\t\t\t\t\t["<->"]\t - self.log("processing protein interactions ...\n") - numNewPath = 0 - numNewAssoc = 0 - iaFile = self.zfile(path+'/homo_sapiens.interactions.txt.gz') #TODO:context manager,iterator - for line in iaFile: - words = line.decode('latin-1').rstrip().split("\t") - if line.decode().startswith('#') or (len(words) < 8): - continue - uniprotP1 = words[0][8:] if words[0].startswith('UniProt:') else None - ensemblG1 = words[1][8:] if words[1].startswith('ENSEMBL:ENSG') else None - ensemblP1 = words[1][8:] if words[1].startswith('ENSEMBL:ENSP') else None - entrezG1 = words[2][12:] if words[2].startswith('Entrez Gene:') else None - uniprotP2 = words[3][8:] if words[3].startswith('UniProt:') else None - ensemblG2 = words[4][8:] if words[4].startswith('ENSEMBL:ENSG') else None - ensemblP2 = words[4][8:] if words[4].startswith('ENSEMBL:ENSP') else None - entrezG2 = words[5][12:] if words[5].startswith('Entrez Gene:') else None - reacttype = words[6] - reactIDs = words[7].split('<->') - reactID1 = reactIDs[0].split('.',1)[0] - reactID2 = reactIDs[1].split('.',1)[0] if (len(reactIDs) > 1) else None - reactID2 = reactID2 if (reactID2 != reactID1) else None - - # if reacttype is "direct_complex" or "indirect_complex", - # the interactors are in the same group (or supergroup) - # and only one REACTOME pathway will be given in column 8; - # if reacttype is "reaction" or "neighbouring_reaction", - # they are not in the same group but interact anyway, but - # there will still be only one pathway named in column 8 - # for "reaction" (and two for "neighboring_reaction") - - if (not reactID1): - tally['no react 1'] += 1 - elif (reactID1 not in reactPath): - tally['no such react 1'] += 1 - else: - in1 = of1 = 0 - if (uniprotP1): - in1 += max(((alias == uniprotP1) and (pathway == reactPath[reactID1])) for pathway,_,alias in nsAssoc['uniprot_pid']['path']) - of1 += 1 - if (ensemblG1): - in1 += max(((alias == ensemblG1) and (pathway == reactPath[reactID1])) for pathway,_,alias in nsAssoc['ensembl_gid']['path']) - of1 += 1 - if (ensemblP1): - in1 += max(((alias == ensemblP1) and (pathway == reactPath[reactID1])) for pathway,_,alias in nsAssoc['ensembl_pid']['path']) - of1 += 1 - tally['%d/%d protein 1 in react 1' % (in1,of1)] += 1 - - in2 = of2 = 0 - if (uniprotP2): - in2 += max(((alias == uniprotP2) and (pathway == reactPath[reactID1])) for pathway,_,alias in nsAssoc['uniprot_pid']['path']) - of2 += 1 - if (ensemblG2): - in2 += max(((alias == ensemblG2) and (pathway == reactPath[reactID1])) for pathway,_,alias in nsAssoc['ensembl_gid']['path']) - of2 += 1 - if (ensemblP2): - in2 += max(((alias == ensemblP2) and (pathway == reactPath[reactID1])) for pathway,_,alias in nsAssoc['ensembl_pid']['path']) - of2 += 1 - tally['%d/%d protein 2 in react 1' % (in2,of2)] += 1 - #if reactID1 - - if (not reactID2): - tally['no react 2'] += 1 - elif (reactID2 not in reactPath): - tally['no such react 2'] += 1 - else: - in1 = of1 = 0 - if (uniprotP1): - in1 += max(((alias == uniprotP1) and (pathway == reactPath[reactID2])) for pathway,_,alias in nsAssoc['uniprot_pid']['path']) - 
of1 += 1 - if (ensemblG1): - in1 += max(((alias == ensemblG1) and (pathway == reactPath[reactID2])) for pathway,_,alias in nsAssoc['ensembl_gid']['path']) - of1 += 1 - if (ensemblP1): - in1 += max(((alias == ensemblP1) and (pathway == reactPath[reactID2])) for pathway,_,alias in nsAssoc['ensembl_pid']['path']) - of1 += 1 - tally['%d/%d protein 1 in react 1' % (in1,of1)] += 1 - - in2 = of2 = 0 - if (uniprotP2): - in2 += max(((alias == uniprotP2) and (pathway == reactPath[reactID2])) for pathway,_,alias in nsAssoc['uniprot_pid']['path']) - of2 += 1 - if (ensemblG2): - in2 += max(((alias == ensemblG2) and (pathway == reactPath[reactID2])) for pathway,_,alias in nsAssoc['ensembl_gid']['path']) - of2 += 1 - if (ensemblP2): - in2 += max(((alias == ensemblP2) and (pathway == reactPath[reactID2])) for pathway,_,alias in nsAssoc['ensembl_pid']['path']) - of2 += 1 - tally['%d/%d protein 2 in react 1' % (in2,of2)] += 1 - #if reactID1 - #foreach line in iaFile - self.log("processing protein interactions completed: %d associations (%d new pathways)\n" % (numNewAssoc,numNewPath)) - numPath += numNewPath - numAssoc += numNewAssoc - for k,v in tally.items(): - print(k, v) - #TODO - - # store pathways - self.log("writing pathways to the database ...\n") - listReact = list(reactPath.keys()) - listGID = self.addTypedGroups(typeID['pathway'], ((subtypeID['-'], reactID, reactPath[reactID]) for reactID in listReact)) - reactGID = dict(zip(listReact, listGID)) - self.log("writing pathways to the database completed\n") - - # store pathway names - self.log("writing pathway names to the database ...\n") - self.addGroupNamespacedNames(namespaceID['reactome_id'], ((gid,reactID) for reactID,gid in reactGID.items())) - self.addGroupNamespacedNames(namespaceID['pathway'], ((gid,reactPath[reactID]) for reactID,gid in reactGID.items())) - self.log("writing pathway names to the database completed\n") - - # store pathway relationships - self.log("writing pathway relationships to the database ...\n") - self.addGroupParentRelationships( (reactGID[parentID],reactGID[childID],relationshipID['']) for parentID,childID in listRelationships if ((parentID in reactGID) and (childID in reactGID)) ) - self.log("writing pathway relationships to the database completed\n") - - # store gene associations - self.log("writing gene associations to the database ...\n") - for ns in nsAssoc: - self.addGroupMemberTypedNamespacedNames(typeID['gene'], namespaceID[ns], ((reactGID[reactID],num,name) for reactID,num,name in nsAssoc[ns]['react'])) - self.addGroupMemberTypedNamespacedNames(typeID['gene'], namespaceID[ns], ((reactGID[pathReact[path]],num,name) for path,num,name in nsAssoc[ns]['path'])) - self.log("writing gene associations to the database completed\n") - #update() - -#Source_reactome + + @classmethod + def getVersionString(cls): + return "2.1 (2015-01-23)" + + # getVersionString() + + def download(self, options, path): + # download the latest source files + self.downloadFilesFromHTTP( + "www.reactome.org", + { + path + + "/ReactomePathways.txt": "/download/current/ReactomePathways.txt", + path + + "/ReactomePathwaysRelation.txt": "/download/current/ReactomePathwaysRelation.txt", + path + + "/ReactomePathways.gmt.zip": "/download/current/ReactomePathways.gmt.zip", + path + + "/UniProt2Reactome.txt": "/download/current/UniProt2Reactome.txt", + path + + "/Ensembl2Reactome.txt": "/download/current/Ensembl2Reactome.txt", + # path+'/homo_sapiens.interactions.txt.gz' : '/download/current/homo_sapiens.interactions.txt.gz', + # 
path+'/gene_association.reactome' : '/download/current/gene_association.reactome', + }, + ) + + return [ + path + "/ReactomePathways.txt", + path + "/ReactomePathwaysRelation.txt", + path + "/ReactomePathways.gmt.zip", + path + "/UniProt2Reactome.txt", + path + "/Ensembl2Reactome.txt", + ] + + # download() + + def update(self, options, path): + # clear out all old data from this source + self.log("deleting old records from the database ...\n") + self.deleteAll() + self.log("deleting old records from the database completed\n") + + # get or create the required metadata records + namespaceID = self.addNamespaces( + [ + ("symbol", 0), + ("entrez_gid", 0), + ("ensembl_gid", 0), + ("ensembl_pid", 1), + ("uniprot_pid", 1), + ("pathway", 0), + ("reactome_id", 0), + ] + ) + relationshipID = self.addRelationships( + [ + ("",), + ] + ) + typeID = self.addTypes( + [ + ("gene",), + ("pathway",), + ] + ) + subtypeID = self.addSubtypes( + [ + ("-",), + ] + ) + + # initialize storage + numPath = 0 + reactPath = dict() + pathReact = dict() + listRelationships = list() + numAssoc = 0 + nsAssoc = { + "symbol": {"path": set(), "react": set()}, + "entrez_gid": {"path": set(), "react": set()}, + "ensembl_gid": {"path": set(), "react": set()}, + "ensembl_pid": {"path": set(), "react": set()}, + "uniprot_pid": {"path": set(), "react": set()}, + } + + # process pathways + # \t\t + self.log("processing pathways ...\n") + numNewPath = 0 + numMismatch = 0 + with open(path + "/ReactomePathways.txt", "r") as pathFile: + # no header + for line in pathFile: + words = line.rstrip().split("\t") + if ( + line.startswith("#") + or (len(words) < 3) + or (words[2] != "Homo sapiens") + ): + continue + reactID = words[0] + pathway = words[1] + + if reactID not in reactPath: + numNewPath += 1 + reactPath[reactID] = pathway + pathReact[pathway] = reactID + elif reactPath[reactID] != pathway: + numMismatch += 1 + # for line in pathFile + # with pathFile + self.log( + "processing pathways completed: %d pathways (%d mismatches)\n" + % (numNewPath, numMismatch) + ) + numPath += numNewPath + + # process pathway relationships + # \t + self.log("processing pathway hierarchy ...\n") + numRelations = 0 + with open(path + "/ReactomePathwaysRelation.txt", "r") as relFile: + # no header + for line in relFile: + words = line.rstrip().split("\t") + if line.startswith("#") or (len(words) < 2): + continue + + numRelations += 1 + listRelationships.append((words[0], words[1])) + # with relFile + self.log( + "processing pathway hierarchy completed: %d relationships\n" + % (numRelations,) + ) + + # process gene sets + # \t"Reactome Pathway"\t\t... + self.log("verifying gene set archive ...\n") + numNewPath = 0 + numNewAssoc = 0 + with zipfile.ZipFile(path + "/ReactomePathways.gmt.zip", "r") as geneZip: + err = geneZip.testzip() + if err: + self.log(" ERROR\n") + raise Exception("CRC failed for %s\n" % err) + self.log("verifying gene set archive completed\n") + self.log("processing gene sets ...\n") + for info in geneZip.infolist(): + # there should be only one file in the archive, but just in case.. 
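+ # any other archive members are simply skipped by this filename check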
+ if info.filename == "ReactomePathways.gmt": + geneFile = geneZip.open(info, "r") + for line in geneFile: + words = line.decode("latin-1").rstrip().split("\t") + if ( + line.decode().startswith("#") + or (len(words) < 3) + or (words[1] != "Reactome Pathway") + ): + continue + pathway = words[0] + + if pathway not in pathReact: + numPath += 1 + numNewPath += 1 + reactID = "REACT_unknown_%d" % (numPath,) + pathReact[pathway] = reactID + reactPath[reactID] = pathway + + for n in range(2, len(words)): + numAssoc += 1 + numNewAssoc += 1 + nsAssoc["symbol"]["path"].add((pathway, numAssoc, words[n])) + # foreach gene symbol + # foreach line in geneFile + geneFile.close() + # if file ok + # foreach file in geneZip + self.log( + "processing gene sets completed: %d associations (%d new pathways)\n" + % (numNewAssoc, numNewPath) + ) + # with geneZip + + # TODO: ChEBI or miRBase mappings? + + # process ensembl mappings (to lowest reactome pathway, not parents) + # http://www.reactome.org/download/mapping.README.txt + # \t\t\t\t\t + self.log("processing ensembl associations ...\n") + numNewPath = 0 + numMismatch = 0 + numNewAssoc = 0 + with open(path + "/Ensembl2Reactome.txt", "r") as assocFile: + for line in assocFile: + words = line.rstrip().split("\t") + if ( + line.startswith("#") + or (len(words) < 6) + or (words[5] != "Homo sapiens") + ): + continue + ensemblID = words[0] + reactID = words[1] + pathway = words[3] + + if ensemblID.startswith("ENSG"): + ns = "ensembl_gid" + elif ensemblID.startswith("ENSP"): + ns = "ensembl_pid" + else: + continue + + if reactID not in reactPath: + numPath += 1 + numNewPath += 1 + reactPath[reactID] = pathway + pathReact[pathway] = reactID + elif reactPath[reactID] != pathway: + numMismatch += 1 + continue + + numAssoc += 1 + numNewAssoc += 1 + nsAssoc[ns]["path"].add((pathway, numAssoc, ensemblID)) + # foreach line in assocFile + # with assocFile + self.log( + "processing ensembl associations completed: %d associations (%d new pathways, %d mismatches)\n" + % (numNewAssoc, numNewPath, numMismatch) + ) + + # process uniprot mappings (to lowest reactome pathway, not parents) + # http://www.reactome.org/download/mapping.README.txt + # \t\t\t\t\t + self.log("processing uniprot associations ...\n") + numNewPath = 0 + numMismatch = 0 + numNewAssoc = 0 + with open(path + "/UniProt2Reactome.txt", "r") as assocFile: + for line in assocFile: + words = line.rstrip().split("\t") + if ( + line.startswith("#") + or (len(words) < 6) + or (words[5] != "Homo sapiens") + ): + continue + uniprotPID = words[0] + reactID = words[1] + pathway = words[3] + + if reactID not in reactPath: + numPath += 1 + numNewPath += 1 + reactPath[reactID] = pathway + pathReact[pathway] = reactID + elif reactPath[reactID] != pathway: + numMismatch += 1 + continue + + numAssoc += 1 + numNewAssoc += 1 + nsAssoc["uniprot_pid"]["path"].add((pathway, numAssoc, uniprotPID)) + # foreach line in assocFile + # with assocFile + self.log( + "processing uniprot associations completed: %d associations (%d new pathways, %d mismatches)\n" + % (numNewAssoc, numNewPath, numMismatch) + ) + numPath += numNewPath + numAssoc += numNewAssoc + + # TODO: process interaction associations? 
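+ # NOTE: the block below is wrapped in `if False:` and never runs; it sketches parsing of homo_sapiens.interactions.txt.gz (whose download is also commented out above) and appears to be retained for reference only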
+ + if False: + tally = collections.defaultdict(int) + # http://www.reactome.org/download/interactions.README.txt + # \t\t\t\t\t\t\t["<->"]\t + self.log("processing protein interactions ...\n") + numNewPath = 0 + numNewAssoc = 0 + iaFile = self.zfile( + path + "/homo_sapiens.interactions.txt.gz" + ) # TODO:context manager,iterator + for line in iaFile: + words = line.decode("latin-1").rstrip().split("\t") + if line.decode().startswith("#") or (len(words) < 8): + continue + uniprotP1 = words[0][8:] if words[0].startswith("UniProt:") else None + ensemblG1 = ( + words[1][8:] if words[1].startswith("ENSEMBL:ENSG") else None + ) + ensemblP1 = ( + words[1][8:] if words[1].startswith("ENSEMBL:ENSP") else None + ) + entrezG1 = ( + words[2][12:] if words[2].startswith("Entrez Gene:") else None + ) + uniprotP2 = words[3][8:] if words[3].startswith("UniProt:") else None + ensemblG2 = ( + words[4][8:] if words[4].startswith("ENSEMBL:ENSG") else None + ) + ensemblP2 = ( + words[4][8:] if words[4].startswith("ENSEMBL:ENSP") else None + ) + entrezG2 = ( + words[5][12:] if words[5].startswith("Entrez Gene:") else None + ) + reacttype = words[6] + reactIDs = words[7].split("<->") + reactID1 = reactIDs[0].split(".", 1)[0] + reactID2 = reactIDs[1].split(".", 1)[0] if (len(reactIDs) > 1) else None + reactID2 = reactID2 if (reactID2 != reactID1) else None + + # if reacttype is "direct_complex" or "indirect_complex", + # the interactors are in the same group (or supergroup) + # and only one REACTOME pathway will be given in column 8; + # if reacttype is "reaction" or "neighbouring_reaction", + # they are not in the same group but interact anyway, but + # there will still be only one pathway named in column 8 + # for "reaction" (and two for "neighboring_reaction") + + if not reactID1: + tally["no react 1"] += 1 + elif reactID1 not in reactPath: + tally["no such react 1"] += 1 + else: + in1 = of1 = 0 + if uniprotP1: + in1 += max( + ((alias == uniprotP1) and (pathway == reactPath[reactID1])) + for pathway, _, alias in nsAssoc["uniprot_pid"]["path"] + ) + of1 += 1 + if ensemblG1: + in1 += max( + ((alias == ensemblG1) and (pathway == reactPath[reactID1])) + for pathway, _, alias in nsAssoc["ensembl_gid"]["path"] + ) + of1 += 1 + if ensemblP1: + in1 += max( + ((alias == ensemblP1) and (pathway == reactPath[reactID1])) + for pathway, _, alias in nsAssoc["ensembl_pid"]["path"] + ) + of1 += 1 + tally["%d/%d protein 1 in react 1" % (in1, of1)] += 1 + + in2 = of2 = 0 + if uniprotP2: + in2 += max( + ((alias == uniprotP2) and (pathway == reactPath[reactID1])) + for pathway, _, alias in nsAssoc["uniprot_pid"]["path"] + ) + of2 += 1 + if ensemblG2: + in2 += max( + ((alias == ensemblG2) and (pathway == reactPath[reactID1])) + for pathway, _, alias in nsAssoc["ensembl_gid"]["path"] + ) + of2 += 1 + if ensemblP2: + in2 += max( + ((alias == ensemblP2) and (pathway == reactPath[reactID1])) + for pathway, _, alias in nsAssoc["ensembl_pid"]["path"] + ) + of2 += 1 + tally["%d/%d protein 2 in react 1" % (in2, of2)] += 1 + # if reactID1 + + if not reactID2: + tally["no react 2"] += 1 + elif reactID2 not in reactPath: + tally["no such react 2"] += 1 + else: + in1 = of1 = 0 + if uniprotP1: + in1 += max( + ((alias == uniprotP1) and (pathway == reactPath[reactID2])) + for pathway, _, alias in nsAssoc["uniprot_pid"]["path"] + ) + of1 += 1 + if ensemblG1: + in1 += max( + ((alias == ensemblG1) and (pathway == reactPath[reactID2])) + for pathway, _, alias in nsAssoc["ensembl_gid"]["path"] + ) + of1 += 1 + if ensemblP1: + in1 += max( + ((alias 
== ensemblP1) and (pathway == reactPath[reactID2])) + for pathway, _, alias in nsAssoc["ensembl_pid"]["path"] + ) + of1 += 1 + tally["%d/%d protein 1 in react 1" % (in1, of1)] += 1 + + in2 = of2 = 0 + if uniprotP2: + in2 += max( + ((alias == uniprotP2) and (pathway == reactPath[reactID2])) + for pathway, _, alias in nsAssoc["uniprot_pid"]["path"] + ) + of2 += 1 + if ensemblG2: + in2 += max( + ((alias == ensemblG2) and (pathway == reactPath[reactID2])) + for pathway, _, alias in nsAssoc["ensembl_gid"]["path"] + ) + of2 += 1 + if ensemblP2: + in2 += max( + ((alias == ensemblP2) and (pathway == reactPath[reactID2])) + for pathway, _, alias in nsAssoc["ensembl_pid"]["path"] + ) + of2 += 1 + tally["%d/%d protein 2 in react 1" % (in2, of2)] += 1 + # if reactID1 + # foreach line in iaFile + self.log( + "processing protein interactions completed: %d associations (%d new pathways)\n" + % (numNewAssoc, numNewPath) + ) + numPath += numNewPath + numAssoc += numNewAssoc + for k, v in tally.items(): + print(k, v) + # TODO + + # store pathways + self.log("writing pathways to the database ...\n") + listReact = list(reactPath.keys()) + listGID = self.addTypedGroups( + typeID["pathway"], + ((subtypeID["-"], reactID, reactPath[reactID]) for reactID in listReact), + ) + reactGID = dict(zip(listReact, listGID)) + self.log("writing pathways to the database completed\n") + + # store pathway names + self.log("writing pathway names to the database ...\n") + self.addGroupNamespacedNames( + namespaceID["reactome_id"], + ((gid, reactID) for reactID, gid in reactGID.items()), + ) + self.addGroupNamespacedNames( + namespaceID["pathway"], + ((gid, reactPath[reactID]) for reactID, gid in reactGID.items()), + ) + self.log("writing pathway names to the database completed\n") + + # store pathway relationships + self.log("writing pathway relationships to the database ...\n") + self.addGroupParentRelationships( + (reactGID[parentID], reactGID[childID], relationshipID[""]) + for parentID, childID in listRelationships + if ((parentID in reactGID) and (childID in reactGID)) + ) + self.log("writing pathway relationships to the database completed\n") + + # store gene associations + self.log("writing gene associations to the database ...\n") + for ns in nsAssoc: + self.addGroupMemberTypedNamespacedNames( + typeID["gene"], + namespaceID[ns], + ( + (reactGID[reactID], num, name) + for reactID, num, name in nsAssoc[ns]["react"] + ), + ) + self.addGroupMemberTypedNamespacedNames( + typeID["gene"], + namespaceID[ns], + ( + (reactGID[pathReact[path]], num, name) + for path, num, name in nsAssoc[ns]["path"] + ), + ) + self.log("writing gene associations to the database completed\n") + + # update() + + +# Source_reactome diff --git a/loki/loaders/loki_source_ucsc_ecr.py b/loki/loaders/loki_source_ucsc_ecr.py index 538aea9..2585912 100644 --- a/loki/loaders/loki_source_ucsc_ecr.py +++ b/loki/loaders/loki_source_ucsc_ecr.py @@ -1,424 +1,564 @@ #!/usr/bin/env python -#import collections +# import collections import itertools from threading import Thread from loki import loki_source class Source_ucsc_ecr(loki_source.Source): - """ - A class to load the pairwise alignments between species as ECRs from the - UCSC inter-species alignments - """ - - - ################################################## - # private class data - - - _chmList = ('1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','X','Y','M') - _comparisons = {"vertebrate":"", "placentalMammals":"placental." , "primates":"primates." 
} - chr_grp_ids = [] - - - ################################################## - # source interface - - - @classmethod - def getVersionString(cls): - return '2.0.1 (2013-03-01)' - #getVersionString() - - - @classmethod - def getOptions(cls): - return { - 'size' : 'minimum length of an ECR in bases (default: 100)', - 'identity' : 'minimum identity of an ECR (default: 0.7)', - 'gap' : 'maximum gap length below the identity threshold (default: 50)' - } - #getOptions() - - - def validateOptions(self, options): - """ - Validate the options - """ - for o,v in options.items(): - try: - if o == 'size': - v = int(v) - elif o == 'identity': - v = float(v) - elif o == 'gap': - v = int(v) - elif o == 'reverse': #undocumented debug option - v = v.lower() - if (v == '0') or 'false'.startswith(v) or 'no'.startswith(v): - v = False - elif (v == '1') or 'true'.startswith(v) or 'yes'.startswith(v): - v = True - else: - return "must be 0/false/no or 1/true/yes" - else: - return "unknown option '%s'" % o - except ValueError: - return "Cannot parse '%s' parameter value - given '%s'" % (o,v) - options[o] = v - #foreach option - return True - #validateOptions() - - - def download(self, options, path): - """ - Download the files - """ - remFiles = dict() - for chm in self._chmList: - for (d,f) in self._comparisons.items(): - remFiles[path+'/'+d+'.chr'+chm+'.phastCons.txt.gz'] = '/goldenPath/hg19/phastCons46way/'+d+'/chr'+chm+'.phastCons46way.'+f+'wigFix.gz' -# self.downloadFilesFromFTP('hgdownload.cse.ucsc.edu', remFiles) - self.downloadFilesFromHTTP('hgdownload.cse.ucsc.edu', remFiles) - - return list(remFiles.keys()) - #download() - - - def update(self, options, path): - """ - Load the data from all of the files - UCSC's phastCons files use 1-based coordinates, according to: - http://genome.ucsc.edu/goldenPath/help/phastCons.html - Since this matches LOKI's convention, we can store them as-is. 
- """ - self.log("deleting old records from the database ...\n") - self.deleteAll() - self.log("deleting old records from the database completed\n") - - # Add a namespace - ecr_ns = self.addNamespace("ucsc_ecr") - - # Add a type of "ecr" - ecr_typeid = self.addType("ecr") - - # Add a type of "ecr_group" - ecr_group_typeid = self.addType("ecr_group") - subtypeID = self.addSubtypes([ - ('-',), - ]) - - # Make sure the '' ldprofile exists - ecr_ldprofile_id = self.addLDProfile('', 'no LD adjustment') - - # Add a containment relationship - rel_id = self.addRelationship("contains") - - for sp in self._comparisons: - self.log("processing ECRs for " + sp + " ...\n") - desc = "ECRs for " + sp - label = "ecr_" + sp - - # Add the group for this species (or comparison) - ecr_gid = self.addTypedGroups(ecr_group_typeid, [(subtypeID['-'], label, desc)])[0] - self.addGroupNamespacedNames(ecr_ns, [(ecr_gid, label)]) - - chr_grp_ids = [] - processThreads = {} - for ch in self._chmList: - processThreads[ch] = Thread(target=self.processECRs, args=(sp, ch, chr_grp_ids, ecr_group_typeid, subtypeID, ecr_ns, ecr_typeid, ecr_ldprofile_id, rel_id, options, path)) - processThreads[ch].start() - - for ch in self._chmList: - processThreads[ch].join() - - self.addGroupRelationships(((ecr_gid, c, rel_id, 1) for c in chr_grp_ids)) - - self.log("processing ECRs for " + sp + " completed\n") - - # store source metadata - self.setSourceBuilds(None, 19) # TODO: check for latest FTP path rather than hardcoded /goldenPath/hg19/phastCons46way/ - #update() - - def processECRs(self, sp, ch, chr_grp_ids, ecr_group_typeid, subtypeID, ecr_ns, ecr_typeid, ecr_ldprofile_id, rel_id, options, path): - ch_id = self._loki.chr_num[ch] - self.log("processing Chromosome " + ch + " ...\n") - f = self.zfile(path+'/'+sp +'.chr'+ ch+'.phastCons.txt.gz') - curr_band = 1 - num_regions = 0 - desc = "ECRs for " + sp + " on Chromosome " + ch - chr_grp_ids.append(self.addTypedGroups(ecr_group_typeid, [(subtypeID['-'], "ecr_%s_chr%s" % (sp, ch), desc)])[0]) - self.addGroupNamespacedNames(ecr_ns, [(chr_grp_ids[-1], "ecr_%s_chr%s" % (sp, ch))]) - band_grps = [] - grp_rid = {} - for regions in self.getRegions(f, options): - label = "ecr_%s_chr%s_band%d" % (sp, ch, curr_band) - desc = "ECRs for " + sp + " on Chromosome " + ch + ", Band %d" % (curr_band,) - num_regions += len(regions) - - if regions: - band_grps.append((subtypeID['-'], label, desc)) - - # Add the region itself - reg_ids = self.addTypedBiopolymers(ecr_typeid, ((self.getRegionName(sp, ch, r), '') for r in regions)) - # Add the name of the region - self.addBiopolymerNamespacedNames(ecr_ns, zip(reg_ids, (self.getRegionName(sp, ch, r) for r in regions))) - # Add the region Boundaries - # This gives a generator that yields [(region_id, (chrom_id, start, stop)) ... 
] - region_bound_gen = zip(((i,) for i in reg_ids), ((ch_id, r[0], r[1]) for r in regions)) - self.addBiopolymerLDProfileRegions(ecr_ldprofile_id, (tuple(itertools.chain(*c)) for c in region_bound_gen)) - - if regions: - grp_rid[band_grps[-1]] = reg_ids - #Add the region to the group - #self.addGroupBiopolymers(((band_gids[-1], r_id) for r_id in reg_ids)) - - curr_band += 1 - - - band_gids = self.addTypedGroups(ecr_group_typeid, band_grps) - self.addGroupNamespacedNames(ecr_ns, zip(band_gids, (r[0] for r in band_grps))) - gid_rid = [] - for i in range(len(band_gids)): - gid_rid.extend(((band_gids[i], rid) for rid in grp_rid[band_grps[i]])) - - self.addGroupBiopolymers(gid_rid) - - self.addGroupRelationships(((chr_grp_ids[-1], b, rel_id, 1) for b in band_gids)) - - self.log("processing Chromosome %s completed (%d regions found in %d bands)\n" % (ch, num_regions, curr_band - 1)) - #processECRs() - - def getRegionName(self, species, ch, region): - """ - Returns a string representation of the name - """ - return species + ":chr" + ch + ":" + str(region[0]) + "-" + str(region[1]) - #getRegionName() - - - def getRegions(self, f, options): - # fetch loader options - minSize = options.get('size',100) - minIdent = options.get('identity',0.7) - maxGap = options.get('gap',50) - reverse = options.get('reverse',False) - - # initialize parser state - pos = 1 - step = 1 - state = None - curStart = pos - curSum = 0.0 - curCount = 0 - - # parse the file - segments = list() - regions = list() - EOF = False - while not EOF: - declaration = None - try: - # parsing can be in one of four states, handled in rough order of frequency; - # we could cover all cases in one 'for line in f:' loop, but doing - # extra tests for things that don't change much is ~45% slower - while True: - loopState = state - loopPos = pos - if (state is False) and (curCount > maxGap): - # in a low segment that is already beyond the max gap length - # (so we don't care about sum or count anymore) - for line in f: - v = float(line) - if v >= minIdent: - state = True - break - pos += step - #for line in f - elif (state is False): - # in a low segment which is still within the max gap length - for line in f: - v = float(line) - if v >= minIdent: - state = True - break - curSum += v - curCount += 1 - pos += step - if curCount > maxGap: - break - #for line in f - elif (state is True): - # in a high segment - for line in f: - v = float(line) - if v < minIdent: - state = False - break - curSum += v - curCount += 1 - pos += step - #for line in f - else: - # starting a new segment at top of file or after a data gap - # (we only have to read 1 value to see what kind of segment is starting) - for line in f: - v = float(line) - state = (v >= minIdent) - break - #for line in f - #if - - # since all states have 'for line in f:' loops, we only land here for a few reasons - if loopState != state: - # we changed threshold state; store the segment, reset the counters and continue - segments.append( (curStart,pos-step,curSum,curCount,loopState) ) - curStart = pos - curSum = v - curCount = 1 - pos += step - elif loopPos == pos: - # we hit EOF; store the segment and process the final batch - segments.append( (curStart,pos-step,curSum,curCount,loopState) ) - EOF = True - break - else: - # we exceeded the max gap length in a low segment; process the batch - break - #if - #while True - except ValueError: - declaration = dict( pair.split('=',1) for pair in line.strip().split() if '=' in pair ) - if ('start' not in declaration) or ('step' not in declaration): - 
raise Exception("ERROR: invalid phastcons format: %s" % line) - # if the new band picks right up after the old one, - # ignore it since there was no actual gap in the data - if int(declaration['start']) == pos: - step = int(declaration['step']) - continue - # store the segment - segments.append( (curStart,pos-step,curSum,curCount,state) ) - #try/ValueError - - # invert segments if requested - if reverse: - for s in range(0,len(segments)): - segments[s] = (-segments[s][1],-segments[s][0]) + segments[s][2:] - segments.reverse() - tmpregions = regions - regions = list() - - # set min/max segment indecies to skip leading or trailing low or invalid segments - sn,sx = 0,len(segments)-1 - while (sn <= sx) and (segments[sn][4] is not True): - sn += 1 - while (sn <= sx) and (segments[sx][4] is not True): - sx -= 1 - #assert ((sn > sx) or ((sx-sn+1)%2)), "segment list size cannot be even (must be hi , hi-lo-hi , etc)" - - # merge applicable high segments according to some metric - if 0: # running-average metric with minSize bugs (original algorithm) - while sn <= sx: - s0,s1 = sn,sn - while (s1 < sx) and ((sum(segments[s][2] for s in range(s0,s1+2)) / sum(segments[s][3] for s in range(s0,s1+2))) >= minIdent): - s1 += 2 - if s1 == sx: - if (segments[s1][1] - segments[s0][0]) > minSize: - regions.append( (segments[s0][0],segments[s1][1]) ) - elif (segments[s1][1] - segments[s0][0]) >= minSize: - regions.append( (segments[s0][0],segments[s1][1]) ) - sn = s1+2 - #while segments to process - elif 1: # running-average metric - while sn <= sx: - s0,s1 = sn,sn - while (s1 < sx) and ((sum(segments[s][2] for s in range(s0,s1+2)) / sum(segments[s][3] for s in range(s0,s1+2))) >= minIdent): - s1 += 2 - if (segments[s1][1] - segments[s0][0] + 1) >= minSize: - regions.append( (segments[s0][0],segments[s1][1]) ) - sn = s1+2 - #while segments to process - elif 0: # potential-average metric - while sn <= sx: - s0,s1 = sn,sn - while (s1 < sx) and ((sum(segments[s][2] for s in range(s0,s1+3)) / sum(segments[s][3] for s in range(s0,s1+3))) >= minIdent): - s1 += 2 - if (segments[s1][1] - segments[s0][0] + 1) >= minSize: - regions.append( (segments[s0][0],segments[s1][1]) ) - sn = s1+2 - #while segments to process - elif 0: # drop-worst metric v1 - partitions = [(sn,sx)] if (sn <= sx) else None - while partitions: - sn,sx = partitions.pop() - s0,s1 = sn,sx - while (s0 < s1) and ((sum(segments[s][2] for s in range(s0,s1+1)) / sum(segments[s][3] for s in range(s0,s1+1))) < minIdent): - sw = [s1-1] - for s in range(s1-3,s0,-2): - if (segments[s][2]+0.0001) < (segments[sw[0]][2]-0.0001): - sw = [s] - elif (segments[s][2]-0.0001) <= (segments[sw[0]][2]+0.0001): - if segments[s][3] > segments[sw[0]][3]: - sw = [s] - elif segments[s][3] == segments[sw[0]][3]: - sw.append(s) - for s in sw: - partitions.append( (s+1,s1) ) - s1 = s-1 - #while segments need splitting - if (segments[s1][1] - segments[s0][0] + 1) >= minSize: - regions.append( (segments[s0][0],segments[s1][1]) ) - #while segments to process - elif 0: # drop-worst metric v2 - partitions = [(sn,sx)] if (sn <= sx) else None - while partitions: - sn,sx = partitions.pop() - s0,s1 = sn,sx - while (s0 < s1) and ((sum(segments[s][2] for s in range(s0,s1+1)) / sum(segments[s][3] for s in range(s0,s1+1))) < minIdent): - sw = [s1-1] - for s in range(s1-3,s0,-2): - if (minIdent*segments[s][3]-segments[s][2]-0.0001) > (minIdent*segments[sw[0]][3]-segments[sw[0]][2]+0.0001): - sw = [s] - elif (minIdent*segments[s][3]-segments[s][2]+0.0001) >= 
(minIdent*segments[sw[0]][3]-segments[sw[0]][2]-0.0001): - if segments[s][3] > segments[sw[0]][3]: - sw = [s] - elif segments[s][3] == segments[sw[0]][3]: - sw.append(s) - for s in sw: - partitions.append( (s+1,s1) ) - s1 = s-1 - #while segments need splitting - if (segments[s1][1] - segments[s0][0] + 1) >= minSize: - regions.append( (segments[s0][0],segments[s1][1]) ) - #while segments to process - else: - raise Exception("ERROR: no segment merge metrics are enabled") - #if metric - segments = list() - - # re-invert results if necessary - if reverse: - for r in range(len(regions)-1,-1,-1): - tmpregions.append( (-regions[r][1],-regions[r][0]) ) - regions = tmpregions - tmpregions = None - - # if we hit a declaration line or EOF, yield this band's regions - if (declaration or EOF) and regions: - yield regions - regions = list() - - # if we hit a declaration line but not EOF, reset the parser state - if declaration: - pos = int(declaration['start']) - step = int(declaration['step']) - state = None - curStart = pos - curSum = 0.0 - curCount = 0 - #while not EOF - #getRegions() - -#Source_ucsc_ecr + """ + A class to load the pairwise alignments between species as ECRs from the + UCSC inter-species alignments + """ + + ################################################## + # private class data + + _chmList = ( + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "21", + "22", + "X", + "Y", + "M", + ) + _comparisons = { + "vertebrate": "", + "placentalMammals": "placental.", + "primates": "primates.", + } + chr_grp_ids = [] + + ################################################## + # source interface + + @classmethod + def getVersionString(cls): + return "2.0.1 (2013-03-01)" + + # getVersionString() + + @classmethod + def getOptions(cls): + return { + "size": "minimum length of an ECR in bases (default: 100)", + "identity": "minimum identity of an ECR (default: 0.7)", + "gap": "maximum gap length below the identity threshold (default: 50)", + } + + # getOptions() + + def validateOptions(self, options): + """ + Validate the options + """ + for o, v in options.items(): + try: + if o == "size": + v = int(v) + elif o == "identity": + v = float(v) + elif o == "gap": + v = int(v) + elif o == "reverse": # undocumented debug option + v = v.lower() + if (v == "0") or "false".startswith(v) or "no".startswith(v): + v = False + elif (v == "1") or "true".startswith(v) or "yes".startswith(v): + v = True + else: + return "must be 0/false/no or 1/true/yes" + else: + return "unknown option '%s'" % o + except ValueError: + return "Cannot parse '%s' parameter value - given '%s'" % (o, v) + options[o] = v + # foreach option + return True + + # validateOptions() + + def download(self, options, path): + """ + Download the files + """ + remFiles = dict() + for chm in self._chmList: + for d, f in self._comparisons.items(): + remFiles[path + "/" + d + ".chr" + chm + ".phastCons.txt.gz"] = ( + "/goldenPath/hg19/phastCons46way/" + + d + + "/chr" + + chm + + ".phastCons46way." 
+ + f + + "wigFix.gz" + ) + # self.downloadFilesFromFTP('hgdownload.cse.ucsc.edu', remFiles) + self.downloadFilesFromHTTP("hgdownload.cse.ucsc.edu", remFiles) + + return list(remFiles.keys()) + + # download() + + def update(self, options, path): + """ + Load the data from all of the files + UCSC's phastCons files use 1-based coordinates, according to: + http://genome.ucsc.edu/goldenPath/help/phastCons.html + Since this matches LOKI's convention, we can store them as-is. + """ + self.log("deleting old records from the database ...\n") + self.deleteAll() + self.log("deleting old records from the database completed\n") + + # Add a namespace + ecr_ns = self.addNamespace("ucsc_ecr") + + # Add a type of "ecr" + ecr_typeid = self.addType("ecr") + + # Add a type of "ecr_group" + ecr_group_typeid = self.addType("ecr_group") + subtypeID = self.addSubtypes( + [ + ("-",), + ] + ) + + # Make sure the '' ldprofile exists + ecr_ldprofile_id = self.addLDProfile("", "no LD adjustment") + + # Add a containment relationship + rel_id = self.addRelationship("contains") + + for sp in self._comparisons: + self.log("processing ECRs for " + sp + " ...\n") + desc = "ECRs for " + sp + label = "ecr_" + sp + + # Add the group for this species (or comparison) + ecr_gid = self.addTypedGroups( + ecr_group_typeid, [(subtypeID["-"], label, desc)] + )[0] + self.addGroupNamespacedNames(ecr_ns, [(ecr_gid, label)]) + + chr_grp_ids = [] + processThreads = {} + for ch in self._chmList: + processThreads[ch] = Thread( + target=self.processECRs, + args=( + sp, + ch, + chr_grp_ids, + ecr_group_typeid, + subtypeID, + ecr_ns, + ecr_typeid, + ecr_ldprofile_id, + rel_id, + options, + path, + ), + ) + processThreads[ch].start() + + for ch in self._chmList: + processThreads[ch].join() + + self.addGroupRelationships(((ecr_gid, c, rel_id, 1) for c in chr_grp_ids)) + + self.log("processing ECRs for " + sp + " completed\n") + + # store source metadata + self.setSourceBuilds( + None, 19 + ) # TODO: check for latest FTP path rather than hardcoded /goldenPath/hg19/phastCons46way/ + + # update() + + def processECRs( + self, + sp, + ch, + chr_grp_ids, + ecr_group_typeid, + subtypeID, + ecr_ns, + ecr_typeid, + ecr_ldprofile_id, + rel_id, + options, + path, + ): + ch_id = self._loki.chr_num[ch] + self.log("processing Chromosome " + ch + " ...\n") + f = self.zfile(path + "/" + sp + ".chr" + ch + ".phastCons.txt.gz") + curr_band = 1 + num_regions = 0 + desc = "ECRs for " + sp + " on Chromosome " + ch + chr_grp_ids.append( + self.addTypedGroups( + ecr_group_typeid, [(subtypeID["-"], "ecr_%s_chr%s" % (sp, ch), desc)] + )[0] + ) + self.addGroupNamespacedNames( + ecr_ns, [(chr_grp_ids[-1], "ecr_%s_chr%s" % (sp, ch))] + ) + band_grps = [] + grp_rid = {} + for regions in self.getRegions(f, options): + label = "ecr_%s_chr%s_band%d" % (sp, ch, curr_band) + desc = ( + "ECRs for " + sp + " on Chromosome " + ch + ", Band %d" % (curr_band,) + ) + num_regions += len(regions) + + if regions: + band_grps.append((subtypeID["-"], label, desc)) + + # Add the region itself + reg_ids = self.addTypedBiopolymers( + ecr_typeid, ((self.getRegionName(sp, ch, r), "") for r in regions) + ) + # Add the name of the region + self.addBiopolymerNamespacedNames( + ecr_ns, zip(reg_ids, (self.getRegionName(sp, ch, r) for r in regions)) + ) + # Add the region Boundaries + # This gives a generator that yields [(region_id, (chrom_id, start, stop)) ... 
] + region_bound_gen = zip( + ((i,) for i in reg_ids), ((ch_id, r[0], r[1]) for r in regions) + ) + self.addBiopolymerLDProfileRegions( + ecr_ldprofile_id, (tuple(itertools.chain(*c)) for c in region_bound_gen) + ) + + if regions: + grp_rid[band_grps[-1]] = reg_ids + # Add the region to the group + # self.addGroupBiopolymers(((band_gids[-1], r_id) for r_id in reg_ids)) + + curr_band += 1 + + band_gids = self.addTypedGroups(ecr_group_typeid, band_grps) + self.addGroupNamespacedNames(ecr_ns, zip(band_gids, (r[0] for r in band_grps))) + gid_rid = [] + for i in range(len(band_gids)): + gid_rid.extend(((band_gids[i], rid) for rid in grp_rid[band_grps[i]])) + + self.addGroupBiopolymers(gid_rid) + + self.addGroupRelationships(((chr_grp_ids[-1], b, rel_id, 1) for b in band_gids)) + + self.log( + "processing Chromosome %s completed (%d regions found in %d bands)\n" + % (ch, num_regions, curr_band - 1) + ) + + # processECRs() + + def getRegionName(self, species, ch, region): + """ + Returns a string representation of the name + """ + return species + ":chr" + ch + ":" + str(region[0]) + "-" + str(region[1]) + + # getRegionName() + + def getRegions(self, f, options): + # fetch loader options + minSize = options.get("size", 100) + minIdent = options.get("identity", 0.7) + maxGap = options.get("gap", 50) + reverse = options.get("reverse", False) + + # initialize parser state + pos = 1 + step = 1 + state = None + curStart = pos + curSum = 0.0 + curCount = 0 + + # parse the file + segments = list() + regions = list() + EOF = False + while not EOF: + declaration = None + try: + # parsing can be in one of four states, handled in rough order of frequency; + # we could cover all cases in one 'for line in f:' loop, but doing + # extra tests for things that don't change much is ~45% slower + while True: + loopState = state + loopPos = pos + if (state is False) and (curCount > maxGap): + # in a low segment that is already beyond the max gap length + # (so we don't care about sum or count anymore) + for line in f: + v = float(line) + if v >= minIdent: + state = True + break + pos += step + # for line in f + elif state is False: + # in a low segment which is still within the max gap length + for line in f: + v = float(line) + if v >= minIdent: + state = True + break + curSum += v + curCount += 1 + pos += step + if curCount > maxGap: + break + # for line in f + elif state is True: + # in a high segment + for line in f: + v = float(line) + if v < minIdent: + state = False + break + curSum += v + curCount += 1 + pos += step + # for line in f + else: + # starting a new segment at top of file or after a data gap + # (we only have to read 1 value to see what kind of segment is starting) + for line in f: + v = float(line) + state = v >= minIdent + break + # for line in f + # if + + # since all states have 'for line in f:' loops, we only land here for a few reasons + if loopState != state: + # we changed threshold state; store the segment, reset the counters and continue + segments.append( + (curStart, pos - step, curSum, curCount, loopState) + ) + curStart = pos + curSum = v + curCount = 1 + pos += step + elif loopPos == pos: + # we hit EOF; store the segment and process the final batch + segments.append( + (curStart, pos - step, curSum, curCount, loopState) + ) + EOF = True + break + else: + # we exceeded the max gap length in a low segment; process the batch + break + # if + # while True + except ValueError: + declaration = dict( + pair.split("=", 1) for pair in line.strip().split() if "=" in pair + ) + if ("start" 
not in declaration) or ("step" not in declaration): + raise Exception("ERROR: invalid phastcons format: %s" % line) + # if the new band picks right up after the old one, + # ignore it since there was no actual gap in the data + if int(declaration["start"]) == pos: + step = int(declaration["step"]) + continue + # store the segment + segments.append((curStart, pos - step, curSum, curCount, state)) + # try/ValueError + + # invert segments if requested + if reverse: + for s in range(0, len(segments)): + segments[s] = (-segments[s][1], -segments[s][0]) + segments[s][2:] + segments.reverse() + tmpregions = regions + regions = list() + + # set min/max segment indecies to skip leading or trailing low or invalid segments + sn, sx = 0, len(segments) - 1 + while (sn <= sx) and (segments[sn][4] is not True): + sn += 1 + while (sn <= sx) and (segments[sx][4] is not True): + sx -= 1 + # assert ((sn > sx) or ((sx-sn+1)%2)), "segment list size cannot be even (must be hi , hi-lo-hi , etc)" + + # merge applicable high segments according to some metric + if 0: # running-average metric with minSize bugs (original algorithm) + while sn <= sx: + s0, s1 = sn, sn + while (s1 < sx) and ( + ( + sum(segments[s][2] for s in range(s0, s1 + 2)) + / sum(segments[s][3] for s in range(s0, s1 + 2)) + ) + >= minIdent + ): + s1 += 2 + if s1 == sx: + if (segments[s1][1] - segments[s0][0]) > minSize: + regions.append((segments[s0][0], segments[s1][1])) + elif (segments[s1][1] - segments[s0][0]) >= minSize: + regions.append((segments[s0][0], segments[s1][1])) + sn = s1 + 2 + # while segments to process + elif 1: # running-average metric + while sn <= sx: + s0, s1 = sn, sn + while (s1 < sx) and ( + ( + sum(segments[s][2] for s in range(s0, s1 + 2)) + / sum(segments[s][3] for s in range(s0, s1 + 2)) + ) + >= minIdent + ): + s1 += 2 + if (segments[s1][1] - segments[s0][0] + 1) >= minSize: + regions.append((segments[s0][0], segments[s1][1])) + sn = s1 + 2 + # while segments to process + elif 0: # potential-average metric + while sn <= sx: + s0, s1 = sn, sn + while (s1 < sx) and ( + ( + sum(segments[s][2] for s in range(s0, s1 + 3)) + / sum(segments[s][3] for s in range(s0, s1 + 3)) + ) + >= minIdent + ): + s1 += 2 + if (segments[s1][1] - segments[s0][0] + 1) >= minSize: + regions.append((segments[s0][0], segments[s1][1])) + sn = s1 + 2 + # while segments to process + elif 0: # drop-worst metric v1 + partitions = [(sn, sx)] if (sn <= sx) else None + while partitions: + sn, sx = partitions.pop() + s0, s1 = sn, sx + while (s0 < s1) and ( + ( + sum(segments[s][2] for s in range(s0, s1 + 1)) + / sum(segments[s][3] for s in range(s0, s1 + 1)) + ) + < minIdent + ): + sw = [s1 - 1] + for s in range(s1 - 3, s0, -2): + if (segments[s][2] + 0.0001) < ( + segments[sw[0]][2] - 0.0001 + ): + sw = [s] + elif (segments[s][2] - 0.0001) <= ( + segments[sw[0]][2] + 0.0001 + ): + if segments[s][3] > segments[sw[0]][3]: + sw = [s] + elif segments[s][3] == segments[sw[0]][3]: + sw.append(s) + for s in sw: + partitions.append((s + 1, s1)) + s1 = s - 1 + # while segments need splitting + if (segments[s1][1] - segments[s0][0] + 1) >= minSize: + regions.append((segments[s0][0], segments[s1][1])) + # while segments to process + elif 0: # drop-worst metric v2 + partitions = [(sn, sx)] if (sn <= sx) else None + while partitions: + sn, sx = partitions.pop() + s0, s1 = sn, sx + while (s0 < s1) and ( + ( + sum(segments[s][2] for s in range(s0, s1 + 1)) + / sum(segments[s][3] for s in range(s0, s1 + 1)) + ) + < minIdent + ): + sw = [s1 - 1] + for s in range(s1 - 
3, s0, -2): + if (minIdent * segments[s][3] - segments[s][2] - 0.0001) > ( + minIdent * segments[sw[0]][3] + - segments[sw[0]][2] + + 0.0001 + ): + sw = [s] + elif ( + minIdent * segments[s][3] - segments[s][2] + 0.0001 + ) >= ( + minIdent * segments[sw[0]][3] + - segments[sw[0]][2] + - 0.0001 + ): + if segments[s][3] > segments[sw[0]][3]: + sw = [s] + elif segments[s][3] == segments[sw[0]][3]: + sw.append(s) + for s in sw: + partitions.append((s + 1, s1)) + s1 = s - 1 + # while segments need splitting + if (segments[s1][1] - segments[s0][0] + 1) >= minSize: + regions.append((segments[s0][0], segments[s1][1])) + # while segments to process + else: + raise Exception("ERROR: no segment merge metrics are enabled") + # if metric + segments = list() + + # re-invert results if necessary + if reverse: + for r in range(len(regions) - 1, -1, -1): + tmpregions.append((-regions[r][1], -regions[r][0])) + regions = tmpregions + tmpregions = None + + # if we hit a declaration line or EOF, yield this band's regions + if (declaration or EOF) and regions: + yield regions + regions = list() + + # if we hit a declaration line but not EOF, reset the parser state + if declaration: + pos = int(declaration["start"]) + step = int(declaration["step"]) + state = None + curStart = pos + curSum = 0.0 + curCount = 0 + # while not EOF + + # getRegions() + + +# Source_ucsc_ecr diff --git a/loki/loaders/test/loki_source_genes.py b/loki/loaders/test/loki_source_genes.py index 38f253c..3bb61d5 100644 --- a/loki/loaders/test/loki_source_genes.py +++ b/loki/loaders/test/loki_source_genes.py @@ -4,123 +4,159 @@ class Source_genes(loki_source.Source): - - - @classmethod - def getVersionString(cls): - return '3.0 (2023-02-22)' - #getVersionString() - - - def download(self, options): - pass - #download() - - - def update(self, options): - # clear out all old data from this source - self.log("deleting old records from the database ...") - self.deleteAll() - self.log(" OK\n") - - # get or create the required metadata records - ldprofileID = self.addLDProfiles([ - ('', 'no LD adjustment', None, None), - ('ld', 'some LD adjustment', None, None), - ]) - namespaceID = self.addNamespaces([ - ('gene', 0), - ('entrez_gid', 0), # needed to resolve snp_entrez_role - ('protein', 1), - ]) - typeID = self.addTypes([ - ('gene',), - ]) - - # define genes - self.log("adding genes to the database ...") - listGene = [ - #(label,desc) - ('A', 'normal gene'), - ('B', 'normal gene'), - ('C', 'overlapping gene'), - ('D', 'overlapping gene'), - ('E', 'gene with 2 regions'), - ('F', 'gene with no SNPs'), - ('G', 'gene with no regions'), - ('H', 'overlapping gene'), - ('I', 'overlapping gene'), - ('P', 'gene with only nearby SNPs'), - ('Q', 'normal gene'), - ('R', 'normal gene'), - ('S', 'gene with 2 regions'), - ] - listBID = self.addTypedBiopolymers(typeID['gene'], listGene) - geneBID = dict(zip((g[0] for g in listGene), listBID)) - self.log(" OK: %d genes\n" % len(geneBID)) - - # define gene aliases - self.log("adding gene identifiers to the database ...") - genEName = ((bid,ord(g)-64) for g,bid in geneBID.items()) # A->1, B->2, ... S->19 ... 
Z->26 - self.addBiopolymerNamespacedNames(namespaceID['entrez_gid'], genEName) - listGName = [ - #(biopolymer_id,name) - # nothing has name 'Z' - (geneBID['A'], 'A'), (geneBID['A'], 'A2'), - (geneBID['B'], 'B'), - (geneBID['C'], 'C'), - (geneBID['D'], 'D'), (geneBID['D'], 'DE'), - (geneBID['E'], 'E'), (geneBID['E'], 'DE'), (geneBID['E'], 'EF'), - (geneBID['F'], 'F'), (geneBID['F'], 'EF'), (geneBID['F'], 'FG'), - (geneBID['G'], 'G'), (geneBID['G'], 'FG'), - (geneBID['H'], 'H'), - (geneBID['I'], 'I'), - (geneBID['P'], 'P'), - (geneBID['Q'], 'Q'), - (geneBID['R'], 'R'), - (geneBID['S'], 'S'), - ] - self.addBiopolymerNamespacedNames(namespaceID['gene'], listGName) - listPName = [ - #(biopolymer_id,name) - (geneBID['P'],'pqr'), (geneBID['P'],'qrp'), - (geneBID['Q'],'pqr'), (geneBID['Q'],'qrp'), (geneBID['Q'],'qrs'), - (geneBID['R'],'pqr'), (geneBID['R'],'qrp'), (geneBID['R'],'qrs'), - (geneBID['S'],'qrs'), - ] - self.addBiopolymerNamespacedNames(namespaceID['protein'], listPName) - self.log(" OK: %d identifiers\n" % (len(geneBID)+len(listGName)+len(listPName))) - - # TODO: name references? - - # define gene regions - self.log("adding gene regions to the database ...") - ld0 = ldprofileID[''] - ld1 = ldprofileID['ld'] - listRegion = [ - #(biopolymer_id,ldprofile_id,chr,posMin,posMax) - (geneBID['A'], ld0, 1, 8, 22), (geneBID['A'], ld1, 1, 6, 24), # expand both, no gain - (geneBID['B'], ld0, 1, 28, 52), (geneBID['B'], ld1, 1, 26, 52), # expand left, no gain - (geneBID['C'], ld0, 1, 54, 62), (geneBID['C'], ld1, 1, 48, 64), # expand both, gain dupe - (geneBID['D'], ld0, 1, 58, 72), (geneBID['D'], ld1, 1, 54, 74), # expand both, gain 1 - (geneBID['E'], ld0, 1, 78, 82), (geneBID['E'], ld1, 1, 78, 84), # expand in, no gain - (geneBID['E'], ld0, 1, 84, 92), (geneBID['E'], ld1, 1, 84, 94), # expand right, no gain - (geneBID['F'], ld0, 1, 94, 98), (geneBID['F'], ld1, 1, 94, 99), # expand right, no gain - # no regions for G - (geneBID['H'], ld0, 2, 22, 42), (geneBID['H'], ld1, 2, 22, 48), # expand to match - (geneBID['I'], ld0, 2, 38, 48), (geneBID['I'], ld1, 2, 22, 48), # expand to match - (geneBID['P'], ld0, 3, 14, 18), (geneBID['P'], ld1, 3, 16, 22), # expand both, gain 1 - (geneBID['Q'], ld0, 3, 28, 36), (geneBID['Q'], ld1, 3, 26, 42), # expand both, gain 1 between - (geneBID['R'], ld0, 3, 44, 52), (geneBID['R'], ld1, 3, 38, 54), # expand both, gain 1 between - (geneBID['S'], ld0, 3, 58, 64), (geneBID['S'], ld1, 3, 56, 72), # expand to dupe - (geneBID['S'], ld0, 3, 66, 72), (geneBID['S'], ld1, 3, 56, 72), # expand to dupe - ] - self.addBiopolymerRegions(listRegion) - self.log(" OK: %d regions\n" % len(listRegion)) - - # set the zone size to 7 so that a few things land right on zone edges - self._loki.setDatabaseSetting("zone_size", "7") - #update() - - -#Source_genes + + @classmethod + def getVersionString(cls): + return "3.0 (2023-02-22)" + + # getVersionString() + + def download(self, options): + pass + + # download() + + def update(self, options): + # clear out all old data from this source + self.log("deleting old records from the database ...") + self.deleteAll() + self.log(" OK\n") + + # get or create the required metadata records + ldprofileID = self.addLDProfiles( + [ + ("", "no LD adjustment", None, None), + ("ld", "some LD adjustment", None, None), + ] + ) + namespaceID = self.addNamespaces( + [ + ("gene", 0), + ("entrez_gid", 0), # needed to resolve snp_entrez_role + ("protein", 1), + ] + ) + typeID = self.addTypes( + [ + ("gene",), + ] + ) + + # define genes + self.log("adding genes to the 
database ...") + listGene = [ + # (label,desc) + ("A", "normal gene"), + ("B", "normal gene"), + ("C", "overlapping gene"), + ("D", "overlapping gene"), + ("E", "gene with 2 regions"), + ("F", "gene with no SNPs"), + ("G", "gene with no regions"), + ("H", "overlapping gene"), + ("I", "overlapping gene"), + ("P", "gene with only nearby SNPs"), + ("Q", "normal gene"), + ("R", "normal gene"), + ("S", "gene with 2 regions"), + ] + listBID = self.addTypedBiopolymers(typeID["gene"], listGene) + geneBID = dict(zip((g[0] for g in listGene), listBID)) + self.log(" OK: %d genes\n" % len(geneBID)) + + # define gene aliases + self.log("adding gene identifiers to the database ...") + genEName = ( + (bid, ord(g) - 64) for g, bid in geneBID.items() + ) # A->1, B->2, ... S->19 ... Z->26 + self.addBiopolymerNamespacedNames(namespaceID["entrez_gid"], genEName) + listGName = [ + # (biopolymer_id,name) + # nothing has name 'Z' + (geneBID["A"], "A"), + (geneBID["A"], "A2"), + (geneBID["B"], "B"), + (geneBID["C"], "C"), + (geneBID["D"], "D"), + (geneBID["D"], "DE"), + (geneBID["E"], "E"), + (geneBID["E"], "DE"), + (geneBID["E"], "EF"), + (geneBID["F"], "F"), + (geneBID["F"], "EF"), + (geneBID["F"], "FG"), + (geneBID["G"], "G"), + (geneBID["G"], "FG"), + (geneBID["H"], "H"), + (geneBID["I"], "I"), + (geneBID["P"], "P"), + (geneBID["Q"], "Q"), + (geneBID["R"], "R"), + (geneBID["S"], "S"), + ] + self.addBiopolymerNamespacedNames(namespaceID["gene"], listGName) + listPName = [ + # (biopolymer_id,name) + (geneBID["P"], "pqr"), + (geneBID["P"], "qrp"), + (geneBID["Q"], "pqr"), + (geneBID["Q"], "qrp"), + (geneBID["Q"], "qrs"), + (geneBID["R"], "pqr"), + (geneBID["R"], "qrp"), + (geneBID["R"], "qrs"), + (geneBID["S"], "qrs"), + ] + self.addBiopolymerNamespacedNames(namespaceID["protein"], listPName) + self.log( + " OK: %d identifiers\n" % (len(geneBID) + len(listGName) + len(listPName)) + ) + + # TODO: name references? 
+ + # define gene regions + self.log("adding gene regions to the database ...") + ld0 = ldprofileID[""] + ld1 = ldprofileID["ld"] + listRegion = [ + # (biopolymer_id,ldprofile_id,chr,posMin,posMax) + (geneBID["A"], ld0, 1, 8, 22), + (geneBID["A"], ld1, 1, 6, 24), # expand both, no gain + (geneBID["B"], ld0, 1, 28, 52), + (geneBID["B"], ld1, 1, 26, 52), # expand left, no gain + (geneBID["C"], ld0, 1, 54, 62), + (geneBID["C"], ld1, 1, 48, 64), # expand both, gain dupe + (geneBID["D"], ld0, 1, 58, 72), + (geneBID["D"], ld1, 1, 54, 74), # expand both, gain 1 + (geneBID["E"], ld0, 1, 78, 82), + (geneBID["E"], ld1, 1, 78, 84), # expand in, no gain + (geneBID["E"], ld0, 1, 84, 92), + (geneBID["E"], ld1, 1, 84, 94), # expand right, no gain + (geneBID["F"], ld0, 1, 94, 98), + (geneBID["F"], ld1, 1, 94, 99), # expand right, no gain + # no regions for G + (geneBID["H"], ld0, 2, 22, 42), + (geneBID["H"], ld1, 2, 22, 48), # expand to match + (geneBID["I"], ld0, 2, 38, 48), + (geneBID["I"], ld1, 2, 22, 48), # expand to match + (geneBID["P"], ld0, 3, 14, 18), + (geneBID["P"], ld1, 3, 16, 22), # expand both, gain 1 + (geneBID["Q"], ld0, 3, 28, 36), + (geneBID["Q"], ld1, 3, 26, 42), # expand both, gain 1 between + (geneBID["R"], ld0, 3, 44, 52), + (geneBID["R"], ld1, 3, 38, 54), # expand both, gain 1 between + (geneBID["S"], ld0, 3, 58, 64), + (geneBID["S"], ld1, 3, 56, 72), # expand to dupe + (geneBID["S"], ld0, 3, 66, 72), + (geneBID["S"], ld1, 3, 56, 72), # expand to dupe + ] + self.addBiopolymerRegions(listRegion) + self.log(" OK: %d regions\n" % len(listRegion)) + + # set the zone size to 7 so that a few things land right on zone edges + self._loki.setDatabaseSetting("zone_size", "7") + + # update() + + +# Source_genes diff --git a/loki/loaders/test/loki_source_light.py b/loki/loaders/test/loki_source_light.py index 9653844..46c4ae4 100644 --- a/loki/loaders/test/loki_source_light.py +++ b/loki/loaders/test/loki_source_light.py @@ -4,103 +4,116 @@ class Source_light(loki_source.Source): - - - @classmethod - def getVersionString(cls): - return '3.0 (2023-02-22)' - #getVersionString() - - - def download(self, options): - pass - #download() - - - def update(self, options): - # clear out all old data from this source - self.log("deleting old records from the database ...") - self.deleteAll() - self.log(" OK\n") - - # get or create the required metadata records - namespaceID = self.addNamespaces([ - ('gene', 0), - ('group', 0), - ]) - relationshipID = self.addRelationships([ - ('shade_of',), - ('greener_than',), - ]) - typeID = self.addTypes([ - ('gene',), - ('group',), - ]) - subtypeID = self.addSubtypes([ - ('-',), - ]) - - # define groups - self.log("adding groups to the database ...") - listGroup = [ - #(label,description) - (subtypeID['-'], 'red', 'normal group'), - (subtypeID['-'], 'green', 'unknown member'), - (subtypeID['-'], 'blue', 'redundant member name'), - (subtypeID['-'], 'gray', 'large parent group'), - ] - listGID = self.addTypedGroups(typeID['group'], listGroup) - groupGID = dict(zip((g[1] for g in listGroup), listGID)) - self.log(" OK: %d groups\n" % len(groupGID)) - - # define group names - self.log("adding group names to the database ...") - listName = [ - #(group_id,name) - (groupGID['red'], 'red'), - (groupGID['green'], 'green'), - (groupGID['blue'], 'blue'), - (groupGID['gray'], 'gray'), - (groupGID['gray'], 'white'), - ] - self.addGroupNamespacedNames(namespaceID['group'], listName) - self.log(" OK: %d names\n" % len(listName)) - - # define group relationships - self.log("adding 
group relationships to the database ...") - listRel = [ - #(group_id,related_group_id,relationship_id,contains) - (groupGID['red'], groupGID['gray'], relationshipID['shade_of'], -1), - (groupGID['green'], groupGID['gray'], relationshipID['shade_of'], -1), - (groupGID['green'], groupGID['blue'], relationshipID['greener_than'], 0), - (groupGID['blue'], groupGID['gray'], relationshipID['shade_of'], -1), - ] - self.addGroupRelationships(listRel) - self.log(" OK: %d relationships\n" % len(listRel)) - - # define group members - self.log("adding group members to the database ...") - listMember = [ - #(group_id,member,name) - (groupGID['red'], 11, 'A'), - (groupGID['red'], 12, 'B'), - (groupGID['green'], 21, 'Z'), - (groupGID['green'], 22, 'A'), - (groupGID['green'], 23, 'B'), - (groupGID['blue'], 31, 'A'), - (groupGID['blue'], 31, 'A2'), - (groupGID['blue'], 32, 'C'), - (groupGID['gray'], 41, 'A2'), - (groupGID['gray'], 42, 'B'), - (groupGID['gray'], 43, 'C'), - (groupGID['gray'], 44, 'D'), - (groupGID['gray'], 45, 'E'), - (groupGID['gray'], 46, 'F'), - (groupGID['gray'], 47, 'G'), - ] - self.addGroupMemberTypedNamespacedNames(typeID['gene'], namespaceID['gene'], listMember) - self.log(" OK: %d members (%d identifiers)\n" % (len(set(m[1] for m in listMember)),len(listMember))) - #update() - - -#Source_light + + @classmethod + def getVersionString(cls): + return "3.0 (2023-02-22)" + + # getVersionString() + + def download(self, options): + pass + + # download() + + def update(self, options): + # clear out all old data from this source + self.log("deleting old records from the database ...") + self.deleteAll() + self.log(" OK\n") + + # get or create the required metadata records + namespaceID = self.addNamespaces( + [ + ("gene", 0), + ("group", 0), + ] + ) + relationshipID = self.addRelationships( + [ + ("shade_of",), + ("greener_than",), + ] + ) + typeID = self.addTypes( + [ + ("gene",), + ("group",), + ] + ) + subtypeID = self.addSubtypes( + [ + ("-",), + ] + ) + + # define groups + self.log("adding groups to the database ...") + listGroup = [ + # (label,description) + (subtypeID["-"], "red", "normal group"), + (subtypeID["-"], "green", "unknown member"), + (subtypeID["-"], "blue", "redundant member name"), + (subtypeID["-"], "gray", "large parent group"), + ] + listGID = self.addTypedGroups(typeID["group"], listGroup) + groupGID = dict(zip((g[1] for g in listGroup), listGID)) + self.log(" OK: %d groups\n" % len(groupGID)) + + # define group names + self.log("adding group names to the database ...") + listName = [ + # (group_id,name) + (groupGID["red"], "red"), + (groupGID["green"], "green"), + (groupGID["blue"], "blue"), + (groupGID["gray"], "gray"), + (groupGID["gray"], "white"), + ] + self.addGroupNamespacedNames(namespaceID["group"], listName) + self.log(" OK: %d names\n" % len(listName)) + + # define group relationships + self.log("adding group relationships to the database ...") + listRel = [ + # (group_id,related_group_id,relationship_id,contains) + (groupGID["red"], groupGID["gray"], relationshipID["shade_of"], -1), + (groupGID["green"], groupGID["gray"], relationshipID["shade_of"], -1), + (groupGID["green"], groupGID["blue"], relationshipID["greener_than"], 0), + (groupGID["blue"], groupGID["gray"], relationshipID["shade_of"], -1), + ] + self.addGroupRelationships(listRel) + self.log(" OK: %d relationships\n" % len(listRel)) + + # define group members + self.log("adding group members to the database ...") + listMember = [ + # (group_id,member,name) + (groupGID["red"], 11, "A"), + 
(groupGID["red"], 12, "B"), + (groupGID["green"], 21, "Z"), + (groupGID["green"], 22, "A"), + (groupGID["green"], 23, "B"), + (groupGID["blue"], 31, "A"), + (groupGID["blue"], 31, "A2"), + (groupGID["blue"], 32, "C"), + (groupGID["gray"], 41, "A2"), + (groupGID["gray"], 42, "B"), + (groupGID["gray"], 43, "C"), + (groupGID["gray"], 44, "D"), + (groupGID["gray"], 45, "E"), + (groupGID["gray"], 46, "F"), + (groupGID["gray"], 47, "G"), + ] + self.addGroupMemberTypedNamespacedNames( + typeID["gene"], namespaceID["gene"], listMember + ) + self.log( + " OK: %d members (%d identifiers)\n" + % (len(set(m[1] for m in listMember)), len(listMember)) + ) + + # update() + + +# Source_light diff --git a/loki/loaders/test/loki_source_paint.py b/loki/loaders/test/loki_source_paint.py index 8d36d59..699075d 100644 --- a/loki/loaders/test/loki_source_paint.py +++ b/loki/loaders/test/loki_source_paint.py @@ -4,99 +4,120 @@ class Source_paint(loki_source.Source): - - - @classmethod - def getVersionString(cls): - return '3.0 (2023-02-22)' - #getVersionString() - - - def download(self, options): - pass - #download() - - - def update(self, options): - # clear out all old data from this source - self.log("deleting old records from the database ...") - self.deleteAll() - self.log(" OK\n") - - # get or create the required metadata records - namespaceID = self.addNamespaces([ - ('gene', 0), - ('group', 0), - ]) - relationshipID = self.addRelationships([ - ('different_than',), - ]) - typeID = self.addTypes([ - ('gene',), - ('group',), - ]) - subtypeID = self.addSubtypes([ - ('-',), - ]) - - # define groups - self.log("adding groups to the database ...") - listGroup = [ - #(label,description) - (subtypeID['-'], 'cyan', 'gene ambiguity resolved by either heuristic'), - (subtypeID['-'], 'magenta', 'gene ambiguity resolved only by implication heuristic'), - (subtypeID['-'], 'yellow', 'gene ambiguity resolved only by quality heuristic'), - (subtypeID['-'], 'gray', 'unresolvable gene ambiguity'), - ] - listGID = self.addTypedGroups(typeID['group'], listGroup) - groupGID = dict(zip((g[1] for g in listGroup), listGID)) - self.log(" OK: %d groups\n" % len(groupGID)) - - # define group names - self.log("adding group names to the database ...") - listName = [ - #(group_id,name) - (groupGID['cyan'], 'cyan'), - (groupGID['magenta'], 'magenta'), - (groupGID['yellow'], 'yellow'), - (groupGID['gray'], 'gray'), - (groupGID['gray'], 'black'), - ] - self.addGroupNamespacedNames(namespaceID['group'], listName) - self.log(" OK: %d names\n" % len(listName)) - - # define group relationships - self.log("adding group relationships to the database ...") - listRel = [ - #(group_id,related_group_id,relationship_id) - (groupGID['cyan'], groupGID['magenta'], relationshipID['different_than']), - (groupGID['magenta'], groupGID['yellow'], relationshipID['different_than']), - (groupGID['yellow'], groupGID['cyan'], relationshipID['different_than']), - ] - self.addGroupSiblingRelationships(listRel) - self.log(" OK: %d relationships\n" % len(listRel)) - - # define group members - self.log("adding group members to the database ...") - listMember = [ - #(group_id,member,name) - (groupGID['cyan'], 11, 'A2'), - (groupGID['cyan'], 12, 'C'), - (groupGID['cyan'], 13, 'D'), - (groupGID['cyan'], 13, 'DE'), - (groupGID['magenta'], 21, 'DE'), - (groupGID['magenta'], 21, 'EF'), - (groupGID['magenta'], 21, 'G'), - (groupGID['yellow'], 31, 'EF'), - (groupGID['yellow'], 31, 'FG'), - (groupGID['yellow'], 31, 'G'), - (groupGID['gray'], 41, 'F'), - (groupGID['gray'], 41, 
'FG'), - (groupGID['gray'], 41, 'G'), - ] - self.addGroupMemberTypedNamespacedNames(typeID['gene'], namespaceID['gene'], listMember) - self.log(" OK: %d members (%d identifiers)\n" % (len(set(m[1] for m in listMember)),len(listMember))) - #update() - - -#Source_paint + + @classmethod + def getVersionString(cls): + return "3.0 (2023-02-22)" + + # getVersionString() + + def download(self, options): + pass + + # download() + + def update(self, options): + # clear out all old data from this source + self.log("deleting old records from the database ...") + self.deleteAll() + self.log(" OK\n") + + # get or create the required metadata records + namespaceID = self.addNamespaces( + [ + ("gene", 0), + ("group", 0), + ] + ) + relationshipID = self.addRelationships( + [ + ("different_than",), + ] + ) + typeID = self.addTypes( + [ + ("gene",), + ("group",), + ] + ) + subtypeID = self.addSubtypes( + [ + ("-",), + ] + ) + + # define groups + self.log("adding groups to the database ...") + listGroup = [ + # (label,description) + (subtypeID["-"], "cyan", "gene ambiguity resolved by either heuristic"), + ( + subtypeID["-"], + "magenta", + "gene ambiguity resolved only by implication heuristic", + ), + ( + subtypeID["-"], + "yellow", + "gene ambiguity resolved only by quality heuristic", + ), + (subtypeID["-"], "gray", "unresolvable gene ambiguity"), + ] + listGID = self.addTypedGroups(typeID["group"], listGroup) + groupGID = dict(zip((g[1] for g in listGroup), listGID)) + self.log(" OK: %d groups\n" % len(groupGID)) + + # define group names + self.log("adding group names to the database ...") + listName = [ + # (group_id,name) + (groupGID["cyan"], "cyan"), + (groupGID["magenta"], "magenta"), + (groupGID["yellow"], "yellow"), + (groupGID["gray"], "gray"), + (groupGID["gray"], "black"), + ] + self.addGroupNamespacedNames(namespaceID["group"], listName) + self.log(" OK: %d names\n" % len(listName)) + + # define group relationships + self.log("adding group relationships to the database ...") + listRel = [ + # (group_id,related_group_id,relationship_id) + (groupGID["cyan"], groupGID["magenta"], relationshipID["different_than"]), + (groupGID["magenta"], groupGID["yellow"], relationshipID["different_than"]), + (groupGID["yellow"], groupGID["cyan"], relationshipID["different_than"]), + ] + self.addGroupSiblingRelationships(listRel) + self.log(" OK: %d relationships\n" % len(listRel)) + + # define group members + self.log("adding group members to the database ...") + listMember = [ + # (group_id,member,name) + (groupGID["cyan"], 11, "A2"), + (groupGID["cyan"], 12, "C"), + (groupGID["cyan"], 13, "D"), + (groupGID["cyan"], 13, "DE"), + (groupGID["magenta"], 21, "DE"), + (groupGID["magenta"], 21, "EF"), + (groupGID["magenta"], 21, "G"), + (groupGID["yellow"], 31, "EF"), + (groupGID["yellow"], 31, "FG"), + (groupGID["yellow"], 31, "G"), + (groupGID["gray"], 41, "F"), + (groupGID["gray"], 41, "FG"), + (groupGID["gray"], 41, "G"), + ] + self.addGroupMemberTypedNamespacedNames( + typeID["gene"], namespaceID["gene"], listMember + ) + self.log( + " OK: %d members (%d identifiers)\n" + % (len(set(m[1] for m in listMember)), len(listMember)) + ) + + # update() + + +# Source_paint diff --git a/loki/loaders/test/loki_source_snps.py b/loki/loaders/test/loki_source_snps.py index 7726ee4..de67161 100644 --- a/loki/loaders/test/loki_source_snps.py +++ b/loki/loaders/test/loki_source_snps.py @@ -4,104 +4,107 @@ class Source_snps(loki_source.Source): - - - @classmethod - def getVersionString(cls): - return '2.0 (2013-02-14)' - 
#getVersionString() - - - def download(self, options): - pass - #download() - - - def update(self, options): - # clear out all old data from this source - self.log("deleting old records from the database ...") - self.deleteAll() - self.log(" OK\n") - - # define positions - self.log("adding SNPs to the database ...") - listSNP = [ - #(rs,chr,pos,valid) - (11, 1, 10, 1), - (12, 1, 20, 1), - (13, 1, 35, 1), - (14, 1, 35, 1), - (15, 1, 50, 1), - (15, 1, 55, 1), - (16, 1, 60, 1), - (17, 1, 70, 1), - (18, 1, 80, 1), - (19, 1, 90, 1), - (21, 2, 10, 1), - (22, 2, 20, 1), - (23, 2, 30, 0), - (24, 2, 40, 1), - (25, 2, 50, 1), - (31, 3, 10, 0), - (32, 3, 20, 1), - (33, 3, 30, 1), - (34, 3, 40, 1), - (35, 3, 50, 1), - (36, 3, 60, 1), - (37, 3, 70, 0), - ] - self.addSNPLoci(listSNP) - self.log(" OK: %d SNP positions (%d RS#s)\n" % (len(listSNP),len(set(s[0] for s in listSNP)))) - - # define merges - self.log("adding SNP merge records to the database ...") - listMerge = [ - #(rsOld,rsNew) - (9,19), - ] - self.addSNPMerges(listMerge) - self.log(" OK: %d merges\n" % len(listMerge)) - - # define role codes - self.log("adding SNP role codes to the database ...") - listRole = [ - #(role,desc,coding,exon) - ('exon', 'exon', 1, 1), - ('utr', 'untranslated region', 0, 1), - ('intron', 'intron', 0, 0), - ('reg', 'regulatory', 1, 0), - ] - roleID = self.addRoles(listRole) - self.log(" OK: %d role codes\n" % len(roleID)) - - # define SNP roles - self.log("adding SNP roles to the database ...") - listSNPRole = [ - #(rs,entrez_id,role_id) - (11,0,roleID['reg']), - (12,1,roleID['exon']), - (13,2,roleID['utr']), - (13,2,roleID['intron']), - # no role for rs14 which overlaps rs13 - (15,2,roleID['reg']), - (15,3,roleID['exon']), - (16,3,roleID['intron']), - (16,4,roleID['intron']), - (17,3,roleID['reg']), - (18,5,roleID['exon']), - ( 9,5,roleID['exon']), # rs9 merged -> rs19 - # no role for rs21 - (22,8,roleID['utr']), - (23,8,roleID['intron']), - (24,8,roleID['reg']), - (24,9,roleID['exon']), - (25,16,roleID['reg']), - (36,19,roleID['intron']), - (37,19,roleID['exon']), - ] - self.addSNPEntrezRoles(listSNPRole) - self.log(" OK: %d roles\n" % len(listSNPRole)) - #update() - - -#Source_snps + + @classmethod + def getVersionString(cls): + return "2.0 (2013-02-14)" + + # getVersionString() + + def download(self, options): + pass + + # download() + + def update(self, options): + # clear out all old data from this source + self.log("deleting old records from the database ...") + self.deleteAll() + self.log(" OK\n") + + # define positions + self.log("adding SNPs to the database ...") + listSNP = [ + # (rs,chr,pos,valid) + (11, 1, 10, 1), + (12, 1, 20, 1), + (13, 1, 35, 1), + (14, 1, 35, 1), + (15, 1, 50, 1), + (15, 1, 55, 1), + (16, 1, 60, 1), + (17, 1, 70, 1), + (18, 1, 80, 1), + (19, 1, 90, 1), + (21, 2, 10, 1), + (22, 2, 20, 1), + (23, 2, 30, 0), + (24, 2, 40, 1), + (25, 2, 50, 1), + (31, 3, 10, 0), + (32, 3, 20, 1), + (33, 3, 30, 1), + (34, 3, 40, 1), + (35, 3, 50, 1), + (36, 3, 60, 1), + (37, 3, 70, 0), + ] + self.addSNPLoci(listSNP) + self.log( + " OK: %d SNP positions (%d RS#s)\n" + % (len(listSNP), len(set(s[0] for s in listSNP))) + ) + + # define merges + self.log("adding SNP merge records to the database ...") + listMerge = [ + # (rsOld,rsNew) + (9, 19), + ] + self.addSNPMerges(listMerge) + self.log(" OK: %d merges\n" % len(listMerge)) + + # define role codes + self.log("adding SNP role codes to the database ...") + listRole = [ + # (role,desc,coding,exon) + ("exon", "exon", 1, 1), + ("utr", "untranslated region", 
0, 1), + ("intron", "intron", 0, 0), + ("reg", "regulatory", 1, 0), + ] + roleID = self.addRoles(listRole) + self.log(" OK: %d role codes\n" % len(roleID)) + + # define SNP roles + self.log("adding SNP roles to the database ...") + listSNPRole = [ + # (rs,entrez_id,role_id) + (11, 0, roleID["reg"]), + (12, 1, roleID["exon"]), + (13, 2, roleID["utr"]), + (13, 2, roleID["intron"]), + # no role for rs14 which overlaps rs13 + (15, 2, roleID["reg"]), + (15, 3, roleID["exon"]), + (16, 3, roleID["intron"]), + (16, 4, roleID["intron"]), + (17, 3, roleID["reg"]), + (18, 5, roleID["exon"]), + (9, 5, roleID["exon"]), # rs9 merged -> rs19 + # no role for rs21 + (22, 8, roleID["utr"]), + (23, 8, roleID["intron"]), + (24, 8, roleID["reg"]), + (24, 9, roleID["exon"]), + (25, 16, roleID["reg"]), + (36, 19, roleID["intron"]), + (37, 19, roleID["exon"]), + ] + self.addSNPEntrezRoles(listSNPRole) + self.log(" OK: %d roles\n" % len(listSNPRole)) + + # update() + + +# Source_snps diff --git a/loki/loaders/test/loki_source_spectrum.py b/loki/loaders/test/loki_source_spectrum.py index b5c9c1f..9392047 100644 --- a/loki/loaders/test/loki_source_spectrum.py +++ b/loki/loaders/test/loki_source_spectrum.py @@ -4,80 +4,89 @@ class Source_spectrum(loki_source.Source): - - - @classmethod - def getVersionString(cls): - return '3.0 (2023-02-22)' - #getVersionString() - - - def download(self, options): - pass - #download() - - - def update(self, options): - # clear out all old data from this source - self.log("deleting old records from the database ...") - self.deleteAll() - self.log(" OK\n") - - # get or create the required metadata records - namespaceID = self.addNamespaces([ - ('gene', 0), - ('protein', 1), - ('group', 0), - ]) - typeID = self.addTypes([ - ('gene',), - ('group',), - ]) - subtypeID = self.addSubtypes([ - ('-',), - ]) - - # define groups - self.log("adding groups to the database ...") - listGroup = [ - #(label,description) - (subtypeID['-'], 'orange', 'one protein, no ambiguity'), - (subtypeID['-'], 'indigo', 'redundant proteins, extraneous gene'), - (subtypeID['-'], 'violet', 'reducible protein ambiguity'), - ] - listGID = self.addTypedGroups(typeID['group'], listGroup) - groupGID = dict(zip((g[1] for g in listGroup), listGID)) - self.log(" OK: %d groups\n" % len(groupGID)) - - # define group names - self.log("adding group names to the database ...") - listName = [ - #(group_id,name) - (groupGID['orange'], 'orange'), - (groupGID['indigo'], 'indigo'), - (groupGID['violet'], 'violet'), - (groupGID['violet'], 'purple'), - ] - self.addGroupNamespacedNames(namespaceID['group'], listName) - self.log(" OK: %d names\n" % len(listName)) - - # define group members - self.log("adding group members to the database ...") - listMember = [ - #(group_id,member,type_id,namespace_id,name) - (groupGID['orange'], 11, typeID['gene'], namespaceID['gene'], 'P'), - (groupGID['orange'], 11, typeID['gene'], namespaceID['gene'], 'Q'), - (groupGID['orange'], 11, typeID['gene'], namespaceID['gene'], 'R'), - (groupGID['orange'], 11, typeID['gene'], namespaceID['protein'], 'pqr'), - (groupGID['indigo'], 21, typeID['gene'], namespaceID['protein'], 'pqr'), - (groupGID['indigo'], 21, typeID['gene'], namespaceID['protein'], 'qrp'), - (groupGID['indigo'], 21, typeID['gene'], namespaceID['gene'], 'S'), - (groupGID['violet'], 31, typeID['gene'], namespaceID['protein'], 'qrp'), - (groupGID['violet'], 31, typeID['gene'], namespaceID['protein'], 'qrs'), - ] - self.addGroupMemberNames(listMember) - self.log(" OK: %d members (%d identifiers)\n" 
% (len(set(m[1] for m in listMember)),len(listMember))) - #update() - - -#Source_spectrum + + @classmethod + def getVersionString(cls): + return "3.0 (2023-02-22)" + + # getVersionString() + + def download(self, options): + pass + + # download() + + def update(self, options): + # clear out all old data from this source + self.log("deleting old records from the database ...") + self.deleteAll() + self.log(" OK\n") + + # get or create the required metadata records + namespaceID = self.addNamespaces( + [ + ("gene", 0), + ("protein", 1), + ("group", 0), + ] + ) + typeID = self.addTypes( + [ + ("gene",), + ("group",), + ] + ) + subtypeID = self.addSubtypes( + [ + ("-",), + ] + ) + + # define groups + self.log("adding groups to the database ...") + listGroup = [ + # (label,description) + (subtypeID["-"], "orange", "one protein, no ambiguity"), + (subtypeID["-"], "indigo", "redundant proteins, extraneous gene"), + (subtypeID["-"], "violet", "reducible protein ambiguity"), + ] + listGID = self.addTypedGroups(typeID["group"], listGroup) + groupGID = dict(zip((g[1] for g in listGroup), listGID)) + self.log(" OK: %d groups\n" % len(groupGID)) + + # define group names + self.log("adding group names to the database ...") + listName = [ + # (group_id,name) + (groupGID["orange"], "orange"), + (groupGID["indigo"], "indigo"), + (groupGID["violet"], "violet"), + (groupGID["violet"], "purple"), + ] + self.addGroupNamespacedNames(namespaceID["group"], listName) + self.log(" OK: %d names\n" % len(listName)) + + # define group members + self.log("adding group members to the database ...") + listMember = [ + # (group_id,member,type_id,namespace_id,name) + (groupGID["orange"], 11, typeID["gene"], namespaceID["gene"], "P"), + (groupGID["orange"], 11, typeID["gene"], namespaceID["gene"], "Q"), + (groupGID["orange"], 11, typeID["gene"], namespaceID["gene"], "R"), + (groupGID["orange"], 11, typeID["gene"], namespaceID["protein"], "pqr"), + (groupGID["indigo"], 21, typeID["gene"], namespaceID["protein"], "pqr"), + (groupGID["indigo"], 21, typeID["gene"], namespaceID["protein"], "qrp"), + (groupGID["indigo"], 21, typeID["gene"], namespaceID["gene"], "S"), + (groupGID["violet"], 31, typeID["gene"], namespaceID["protein"], "qrp"), + (groupGID["violet"], 31, typeID["gene"], namespaceID["protein"], "qrs"), + ] + self.addGroupMemberNames(listMember) + self.log( + " OK: %d members (%d identifiers)\n" + % (len(set(m[1] for m in listMember)), len(listMember)) + ) + + # update() + + +# Source_spectrum diff --git a/loki/loki_db.py b/loki/loki_db.py index 4f86064..141ef2d 100644 --- a/loki/loki_db.py +++ b/loki/loki_db.py @@ -13,172 +13,189 @@ # Docstring has not been inspected line by line ################################################## + class Database(object): - """ - A class to interact with a SQLite database using APSW. - - Attributes: - chr_num (dict): A dictionary mapping chromosome names and numbers. - chr_name (dict): A dictionary mapping chromosome numbers to names. - _schema (dict): A dictionary containing the schema definition for the database. - """ - - ################################################## - # class interrogation - - @classmethod - def getVersionTuple(cls): - """ - Returns the version information of the database as a tuple. - - Returns: - tuple: A tuple containing (major, minor, revision, dev, build, date). 
- """ - # tuple = (major,minor,revision,dev,build,date) - # dev must be in ('a','b','rc','release') for lexicographic comparison - return (2,2,5,'release','','2019-03-15') - #getVersionTuple() - - - @classmethod - def getVersionString(cls): - """ - Returns the version information of the database as a formatted string. - - Returns: - str: A formatted version string. - """ - v = list(cls.getVersionTuple()) - # tuple = (major,minor,revision,dev,build,date) - # dev must be > 'rc' for releases for lexicographic comparison, - # but we don't need to actually print 'release' in the version string - v[3] = '' if v[3] > 'rc' else v[3] - return "%d.%d.%d%s%s (%s)" % tuple(v) - #getVersionString() - - - @classmethod - def getDatabaseDriverName(cls): - """ - Returns the name of the database driver. - - Returns: - str: The database driver name. - """ - return "SQLite" - #getDatabaseDriverName() - - - @classmethod - def getDatabaseDriverVersion(cls): - """ - Returns the version of the SQLite library. - - Returns: - str: The SQLite library version. - """ - return apsw.sqlitelibversion() - #getDatabaseDriverVersion() - - - @classmethod - def getDatabaseInterfaceName(cls): - """ - Returns the name of the database interface. - - Returns: - str: The database interface name. - """ - return "APSW" - #getDatabaseInterfaceName() - - - @classmethod - def getDatabaseInterfaceVersion(cls): - """ - Returns the version of the APSW library. - - Returns: - str: The APSW library version. - """ - return apsw.apswversion() - #getDatabaseInterfaceVersion() - - - ################################################## - # public class data - - - # hardcode translations between chromosome numbers and textual tags - chr_num = {} - chr_name = {} - cnum = 0 - for cname in ('1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','X','Y','XY','MT'): - cnum += 1 - chr_num[cnum] = cnum - chr_num['%s' % cnum] = cnum - chr_num[cname] = cnum - chr_name[cnum] = cname - chr_name['%s' % cnum] = cname - chr_name[cname] = cname - chr_num['M'] = chr_num['MT'] - chr_name['M'] = chr_name['MT'] - - - ################################################## - # private class data - - - _schema = { - 'db': { - ################################################## - # configuration tables - - - 'setting': { - 'table': """ + """ + A class to interact with a SQLite database using APSW. + + Attributes: + chr_num (dict): A dictionary mapping chromosome names and numbers. + chr_name (dict): A dictionary mapping chromosome numbers to names. + _schema (dict): A dictionary containing the schema definition for the database. + """ + + ################################################## + # class interrogation + + @classmethod + def getVersionTuple(cls): + """ + Returns the version information of the database as a tuple. + + Returns: + tuple: A tuple containing (major, minor, revision, dev, build, date). + """ + # tuple = (major,minor,revision,dev,build,date) + # dev must be in ('a','b','rc','release') for lexicographic comparison + return (2, 2, 5, "release", "", "2019-03-15") + + # getVersionTuple() + + @classmethod + def getVersionString(cls): + """ + Returns the version information of the database as a formatted string. + + Returns: + str: A formatted version string. 
+ """ + v = list(cls.getVersionTuple()) + # tuple = (major,minor,revision,dev,build,date) + # dev must be > 'rc' for releases for lexicographic comparison, + # but we don't need to actually print 'release' in the version string + v[3] = "" if v[3] > "rc" else v[3] + return "%d.%d.%d%s%s (%s)" % tuple(v) + + # getVersionString() + + @classmethod + def getDatabaseDriverName(cls): + """ + Returns the name of the database driver. + + Returns: + str: The database driver name. + """ + return "SQLite" + + # getDatabaseDriverName() + + @classmethod + def getDatabaseDriverVersion(cls): + """ + Returns the version of the SQLite library. + + Returns: + str: The SQLite library version. + """ + return apsw.sqlitelibversion() + + # getDatabaseDriverVersion() + + @classmethod + def getDatabaseInterfaceName(cls): + """ + Returns the name of the database interface. + + Returns: + str: The database interface name. + """ + return "APSW" + + # getDatabaseInterfaceName() + + @classmethod + def getDatabaseInterfaceVersion(cls): + """ + Returns the version of the APSW library. + + Returns: + str: The APSW library version. + """ + return apsw.apswversion() + + # getDatabaseInterfaceVersion() + + ################################################## + # public class data + + # hardcode translations between chromosome numbers and textual tags + chr_num = {} + chr_name = {} + cnum = 0 + for cname in ( + "1", + "2", + "3", + "4", + "5", + "6", + "7", + "8", + "9", + "10", + "11", + "12", + "13", + "14", + "15", + "16", + "17", + "18", + "19", + "20", + "21", + "22", + "X", + "Y", + "XY", + "MT", + ): + cnum += 1 + chr_num[cnum] = cnum + chr_num["%s" % cnum] = cnum + chr_num[cname] = cnum + chr_name[cnum] = cname + chr_name["%s" % cnum] = cname + chr_name[cname] = cname + chr_num["M"] = chr_num["MT"] + chr_name["M"] = chr_name["MT"] + + ################################################## + # private class data + + _schema = { + "db": { + ################################################## + # configuration tables + "setting": { + "table": """ ( setting VARCHAR(32) PRIMARY KEY NOT NULL, value VARCHAR(256) ) """, - 'data': [ - ('schema','3'), - ('ucschg',None), - ('zone_size','100000'), - ('optimized','0'), - ('finalized','0'), - ], - 'index': {} - }, #.db.setting - - - ################################################## - # metadata tables - - - 'grch_ucschg': { - 'table': """ + "data": [ + ("schema", "3"), + ("ucschg", None), + ("zone_size", "100000"), + ("optimized", "0"), + ("finalized", "0"), + ], + "index": {}, + }, # .db.setting + ################################################## + # metadata tables + "grch_ucschg": { + "table": """ ( grch INTEGER PRIMARY KEY, ucschg INTEGER NOT NULL ) """, - # translations known at time of writing are still provided, - # but additional translations will also be fetched at update - 'data': [ - (34,16), - (35,17), - (36,18), - (37,19), - (38,38), - ], - 'index': {} - }, #.db.grch_ucschg - - - 'ldprofile': { - 'table': """ + # translations known at time of writing are still provided, + # but additional translations will also be fetched at update + "data": [ + (34, 16), + (35, 17), + (36, 18), + (37, 19), + (38, 38), + ], + "index": {}, + }, # .db.grch_ucschg + "ldprofile": { + "table": """ ( ldprofile_id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, ldprofile VARCHAR(32) UNIQUE NOT NULL, @@ -187,35 +204,29 @@ def getDatabaseInterfaceVersion(cls): value DOUBLE ) """, - 'index': {} - }, #.db.ldprofile - - - 'namespace': { - 'table': """ + "index": {}, + }, # .db.ldprofile + "namespace": { + 
"table": """ ( namespace_id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, namespace VARCHAR(32) UNIQUE NOT NULL, polygenic TINYINT NOT NULL DEFAULT 0 ) """, - 'index': {} - }, #.db.namespace - - - 'relationship': { - 'table': """ + "index": {}, + }, # .db.namespace + "relationship": { + "table": """ ( relationship_id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, relationship VARCHAR(32) UNIQUE NOT NULL ) """, - 'index': {} - }, #.db.relationship - - - 'role': { - 'table': """ + "index": {}, + }, # .db.relationship + "role": { + "table": """ ( role_id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, role VARCHAR(32) UNIQUE NOT NULL, @@ -224,12 +235,10 @@ def getDatabaseInterfaceVersion(cls): exon TINYINT ) """, - 'index': {} - }, #.db.role - - - 'source': { - 'table': """ + "index": {}, + }, # .db.role + "source": { + "table": """ ( source_id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, source VARCHAR(32) UNIQUE NOT NULL, @@ -240,12 +249,10 @@ def getDatabaseInterfaceVersion(cls): current_ucschg INTEGER ) """, - 'index': {} - }, #.db.source - - - 'source_option': { - 'table': """ + "index": {}, + }, # .db.source + "source_option": { + "table": """ ( source_id TINYINT NOT NULL, option VARCHAR(32) NOT NULL, @@ -253,12 +260,10 @@ def getDatabaseInterfaceVersion(cls): PRIMARY KEY (source_id, option) ) """, - 'index': {} - }, #.db.source_option - - - 'source_file': { - 'table': """ + "index": {}, + }, # .db.source_option + "source_file": { + "table": """ ( source_id TINYINT NOT NULL, filename VARCHAR(256) NOT NULL, @@ -268,65 +273,54 @@ def getDatabaseInterfaceVersion(cls): PRIMARY KEY (source_id, filename) ) """, - 'index': {} - }, #.db.source_file - - - 'type': { - 'table': """ + "index": {}, + }, # .db.source_file + "type": { + "table": """ ( type_id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, type VARCHAR(32) UNIQUE NOT NULL ) """, - 'index': {} - }, #.db.type - - 'subtype': { - 'table': """ + "index": {}, + }, # .db.type + "subtype": { + "table": """ ( subtype_id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, subtype VARCHAR(32) UNIQUE NOT NULL ) """, - 'index': {} - }, #.db.subtype - - - 'warning': { - 'table': """ + "index": {}, + }, # .db.subtype + "warning": { + "table": """ ( warning_id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, source_id TINYINT NOT NULL, warning VARCHAR(8192) ) """, - 'index': { - 'warning__source': '(source_id)', - } - }, #.db.warning - - - ################################################## - # snp tables - - - 'snp_merge': { - 'table': """ + "index": { + "warning__source": "(source_id)", + }, + }, # .db.warning + ################################################## + # snp tables + "snp_merge": { + "table": """ ( rsMerged INTEGER NOT NULL, rsCurrent INTEGER NOT NULL, source_id TINYINT NOT NULL ) """, - 'index': { - 'snp_merge__merge_current': '(rsMerged,rsCurrent)', - } - }, #.db.snp_merge - - - 'snp_locus': { # all coordinates in LOKI are 1-based closed intervals - 'table': """ + "index": { + "snp_merge__merge_current": "(rsMerged,rsCurrent)", + }, + }, # .db.snp_merge + "snp_locus": { # all coordinates in LOKI are 1-based closed intervals + "table": """ ( rs INTEGER NOT NULL, chr TINYINT NOT NULL, @@ -335,17 +329,15 @@ def getDatabaseInterfaceVersion(cls): source_id TINYINT NOT NULL ) """, - 'index': { - 'snp_locus__rs_chr_pos': '(rs,chr,pos)', - 'snp_locus__chr_pos_rs': '(chr,pos,rs)', - # a (validated,...) 
index would be nice but adds >1GB to the file size :/ - #'snp_locus__valid_chr_pos_rs': '(validated,chr,pos,rs)', - } - }, #.db.snp_locus - - - 'snp_entrez_role': { - 'table': """ + "index": { + "snp_locus__rs_chr_pos": "(rs,chr,pos)", + "snp_locus__chr_pos_rs": "(chr,pos,rs)", + # a (validated,...) index would be nice but adds >1GB to the file size :/ + #'snp_locus__valid_chr_pos_rs': '(validated,chr,pos,rs)', + }, + }, # .db.snp_locus + "snp_entrez_role": { + "table": """ ( rs INTEGER NOT NULL, entrez_id INTEGER NOT NULL, @@ -353,14 +345,12 @@ def getDatabaseInterfaceVersion(cls): source_id TINYINT NOT NULL ) """, - 'index': { - 'snp_entrez_role__rs_entrez_role': '(rs,entrez_id,role_id)', - } - }, #.db.snp_entrez_role - - - 'snp_biopolymer_role': { - 'table': """ + "index": { + "snp_entrez_role__rs_entrez_role": "(rs,entrez_id,role_id)", + }, + }, # .db.snp_entrez_role + "snp_biopolymer_role": { + "table": """ ( rs INTEGER NOT NULL, biopolymer_id INTEGER NOT NULL, @@ -368,19 +358,15 @@ def getDatabaseInterfaceVersion(cls): source_id TINYINT NOT NULL ) """, - 'index': { - 'snp_biopolymer_role__rs_biopolymer_role': '(rs,biopolymer_id,role_id)', - 'snp_biopolymer_role__biopolymer_rs_role': '(biopolymer_id,rs,role_id)', - } - }, #.db.snp_biopolymer_role - - - ################################################## - # biopolymer tables - - - 'biopolymer': { - 'table': """ + "index": { + "snp_biopolymer_role__rs_biopolymer_role": "(rs,biopolymer_id,role_id)", + "snp_biopolymer_role__biopolymer_rs_role": "(biopolymer_id,rs,role_id)", + }, + }, # .db.snp_biopolymer_role + ################################################## + # biopolymer tables + "biopolymer": { + "table": """ ( biopolymer_id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, type_id TINYINT NOT NULL, @@ -389,15 +375,13 @@ def getDatabaseInterfaceVersion(cls): source_id TINYINT NOT NULL ) """, - 'index': { - 'biopolymer__type': '(type_id)', - 'biopolymer__label_type': '(label,type_id)', - } - }, #.db.biopolymer - - - 'biopolymer_name': { - 'table': """ + "index": { + "biopolymer__type": "(type_id)", + "biopolymer__label_type": "(label,type_id)", + }, + }, # .db.biopolymer + "biopolymer_name": { + "table": """ ( biopolymer_id INTEGER NOT NULL, namespace_id INTEGER NOT NULL, @@ -406,15 +390,13 @@ def getDatabaseInterfaceVersion(cls): PRIMARY KEY (biopolymer_id,namespace_id,name) ) """, - 'index': { - 'biopolymer_name__name_namespace_biopolymer': '(name,namespace_id,biopolymer_id)', - } - }, #.db.biopolymer_name - - - 'biopolymer_name_name': { - # PRIMARY KEY column order satisfies the need to GROUP BY new_namespace_id, new_name - 'table': """ + "index": { + "biopolymer_name__name_namespace_biopolymer": "(name,namespace_id,biopolymer_id)", + }, + }, # .db.biopolymer_name + "biopolymer_name_name": { + # PRIMARY KEY column order satisfies the need to GROUP BY new_namespace_id, new_name + "table": """ ( namespace_id INTEGER NOT NULL, name VARCHAR(256) NOT NULL, @@ -425,12 +407,10 @@ def getDatabaseInterfaceVersion(cls): PRIMARY KEY (new_namespace_id,new_name,type_id,namespace_id,name) ) """, - 'index': {} - }, #.db.biopolymer_name_name - - - 'biopolymer_region': { # all coordinates in LOKI are 1-based closed intervals - 'table': """ + "index": {}, + }, # .db.biopolymer_name_name + "biopolymer_region": { # all coordinates in LOKI are 1-based closed intervals + "table": """ ( biopolymer_id INTEGER NOT NULL, ldprofile_id INTEGER NOT NULL, @@ -441,15 +421,13 @@ def getDatabaseInterfaceVersion(cls): PRIMARY KEY 
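# A small sketch of a region lookup against a snp_locus-style table. Coordinates in
# LOKI are 1-based closed intervals, as noted above, and BETWEEN is inclusive on both
# ends, which matches. Uses the standard-library sqlite3 module and made-up rows just
# for the illustration.
import sqlite3

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE snp_locus (rs INTEGER, chr TINYINT, pos BIGINT)")
con.execute("CREATE INDEX snp_locus__chr_pos_rs ON snp_locus (chr, pos, rs)")
con.executemany(
    "INSERT INTO snp_locus VALUES (?,?,?)",
    [(11, 1, 100), (12, 1, 250), (13, 2, 250)],
)
rows = con.execute(
    "SELECT rs FROM snp_locus WHERE chr = ? AND pos BETWEEN ? AND ? ORDER BY pos",
    (1, 100, 250),
).fetchall()
assert [r[0] for r in rows] == [11, 12]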
(biopolymer_id,ldprofile_id,chr,posMin,posMax) ) """, - 'index': { - 'biopolymer_region__ldprofile_chr_min': '(ldprofile_id,chr,posMin)', - 'biopolymer_region__ldprofile_chr_max': '(ldprofile_id,chr,posMax)', - } - }, #.db.biopolymer_region - - - 'biopolymer_zone': { - 'table': """ + "index": { + "biopolymer_region__ldprofile_chr_min": "(ldprofile_id,chr,posMin)", + "biopolymer_region__ldprofile_chr_max": "(ldprofile_id,chr,posMax)", + }, + }, # .db.biopolymer_region + "biopolymer_zone": { + "table": """ ( biopolymer_id INTEGER NOT NULL, chr TINYINT NOT NULL, @@ -457,18 +435,14 @@ def getDatabaseInterfaceVersion(cls): PRIMARY KEY (biopolymer_id,chr,zone) ) """, - 'index': { - 'biopolymer_zone__zone': '(chr,zone,biopolymer_id)', - } - }, #.db.biopolymer_zone - - - ################################################## - # group tables - - - 'group': { - 'table': """ + "index": { + "biopolymer_zone__zone": "(chr,zone,biopolymer_id)", + }, + }, # .db.biopolymer_zone + ################################################## + # group tables + "group": { + "table": """ ( group_id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, type_id TINYINT NOT NULL, @@ -478,16 +452,14 @@ def getDatabaseInterfaceVersion(cls): source_id TINYINT NOT NULL ) """, - 'index': { - 'group__type': '(type_id)', - 'group__subtype': '(subtype_id)', - 'group__label_type': '(label,type_id)', - } - }, #.db.group - - - 'group_name': { - 'table': """ + "index": { + "group__type": "(type_id)", + "group__subtype": "(subtype_id)", + "group__label_type": "(label,type_id)", + }, + }, # .db.group + "group_name": { + "table": """ ( group_id INTEGER NOT NULL, namespace_id INTEGER NOT NULL, @@ -496,15 +468,13 @@ def getDatabaseInterfaceVersion(cls): PRIMARY KEY (group_id,namespace_id,name) ) """, - 'index': { - 'group_name__name_namespace_group': '(name,namespace_id,group_id)', - 'group_name__source_name': '(source_id,name)', - } - }, #.db.group_name - - - 'group_group': { - 'table': """ + "index": { + "group_name__name_namespace_group": "(name,namespace_id,group_id)", + "group_name__source_name": "(source_id,name)", + }, + }, # .db.group_name + "group_group": { + "table": """ ( group_id INTEGER NOT NULL, related_group_id INTEGER NOT NULL, @@ -515,14 +485,12 @@ def getDatabaseInterfaceVersion(cls): PRIMARY KEY (group_id,related_group_id,relationship_id,direction) ) """, - 'index': { - 'group_group__related': '(related_group_id,group_id)', - } - }, #.db.group_group - - - 'group_biopolymer': { - 'table': """ + "index": { + "group_group__related": "(related_group_id,group_id)", + }, + }, # .db.group_group + "group_biopolymer": { + "table": """ ( group_id INTEGER NOT NULL, biopolymer_id INTEGER NOT NULL, @@ -533,14 +501,12 @@ def getDatabaseInterfaceVersion(cls): PRIMARY KEY (group_id,biopolymer_id,source_id) ) """, - 'index': { - 'group_biopolymer__biopolymer': '(biopolymer_id,group_id)', - } - }, #.db.group_biopolymer - - - 'group_member_name': { - 'table': """ + "index": { + "group_biopolymer__biopolymer": "(biopolymer_id,group_id)", + }, + }, # .db.group_biopolymer + "group_member_name": { + "table": """ ( group_id INTEGER NOT NULL, member INTEGER NOT NULL, @@ -551,16 +517,12 @@ def getDatabaseInterfaceVersion(cls): PRIMARY KEY (group_id,member,type_id,namespace_id,name) ) """, - 'index': {} - }, #.db.group_member_name - - - ################################################## - # gwas tables - - - 'gwas': { # all coordinates in LOKI are 1-based closed intervals - 'table': """ + "index": {}, + }, # .db.group_member_name + 
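# Hedged sketch of the zone idea behind the biopolymer_zone table above: regions are
# indexed under fixed-size position windows, with the window size taken from the
# 'zone_size' setting (100000 in the seed data). Assuming zone = position // zone_size,
# the zones a 1-based closed region overlaps can be computed as below; the exact
# rounding convention used by the real updater is not shown in this hunk.
ZONE_SIZE = 100000

def zones_for_region(pos_min, pos_max, zone_size=ZONE_SIZE):
    return list(range(pos_min // zone_size, pos_max // zone_size + 1))

assert zones_for_region(150000, 150010) == [1]
assert zones_for_region(95000, 210000) == [0, 1, 2]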
################################################## + # gwas tables + "gwas": { # all coordinates in LOKI are 1-based closed intervals + "table": """ ( gwas_id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, rs INTEGER, @@ -575,19 +537,15 @@ def getDatabaseInterfaceVersion(cls): source_id TINYINT NOT NULL ) """, - 'index': { - 'gwas__rs': '(rs)', - 'gwas__chr_pos': '(chr,pos)', - } - }, #.db.gwas - - - ################################################## - # liftover tables - - - 'chain': { # all coordinates in LOKI are 1-based closed intervals - 'table': """ + "index": { + "gwas__rs": "(rs)", + "gwas__chr_pos": "(chr,pos)", + }, + }, # .db.gwas + ################################################## + # liftover tables + "chain": { # all coordinates in LOKI are 1-based closed intervals + "table": """ ( chain_id INTEGER PRIMARY KEY AUTOINCREMENT NOT NULL, old_ucschg INTEGER NOT NULL, @@ -603,14 +561,12 @@ def getDatabaseInterfaceVersion(cls): source_id TINYINT NOT NULL ) """, - 'index': { - 'chain__oldhg_newhg_chr': '(old_ucschg,new_ucschg,old_chr)', - } - }, #.db.chain - - - 'chain_data': { # all coordinates in LOKI are 1-based closed intervals - 'table': """ + "index": { + "chain__oldhg_newhg_chr": "(old_ucschg,new_ucschg,old_chr)", + }, + }, # .db.chain + "chain_data": { # all coordinates in LOKI are 1-based closed intervals + "table": """ ( chain_id INTEGER NOT NULL, old_start BIGINT NOT NULL, @@ -620,1430 +576,1596 @@ def getDatabaseInterfaceVersion(cls): PRIMARY KEY (chain_id,old_start) ) """, - 'index': { - 'chain_data__end': '(chain_id,old_end)', - } - }, #.db.chain_data - - }, #.db - - } #_schema{} - - - ################################################## - # constructor - - - def __init__(self, dbFile=None, testing=False, updating=False, tempMem=False): - """ - Initializes a Database instance. - - Args: - dbFile (str, optional): The database file to attach. - testing (bool, optional): If True, runs in testing mode. - updating (bool, optional): If True, runs in updating mode. - tempMem (bool, optional): If True, uses memory for temporary storage. - """ - # initialize instance properties - self._is_test = testing - self._updating = updating - self._verbose = True - self._logger = None - self._logFile = sys.stderr - self._logIndent = 0 - self._logHanging = False - self._db = apsw.Connection('') - self._dbFile = None - self._dbNew = None - self._updater = None - self._liftOverCache = dict() # { (from,to) : [] } - - self.configureDatabase(tempMem=tempMem) - self.attachDatabaseFile(dbFile) - #__init__() - - - ################################################## - # context manager - - - def __enter__(self): - """ - Enters the context manager. - - Returns: - Connection: The APSW connection object. - """ - return self._db.__enter__() - #__enter__() - - - def __exit__(self, excType, excVal, traceback): - """ - Exits the context manager. - - Args: - excType (type): Exception type. - excVal (Exception): Exception value. - traceback (traceback): Traceback object. - - Returns: - bool: True if no exception occurred, otherwise False. - """ - return self._db.__exit__(excType, excVal, traceback) - #__exit__() - - - ################################################## - # logging - - - def _checkTesting(self): - """ - Checks and updates the testing setting in the database. - - Returns: - bool: True if testing settings match, otherwise False. 
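# Hedged sketch of the liftover arithmetic the chain/chain_data tables support, in the
# spirit of UCSC chain files. Assuming each chain_data segment carries
# (old_start, old_end, new_start) with 1-based closed intervals, a position inside a
# segment maps by a constant offset; strand handling and the real gap logic live in the
# liftover code, which is not part of this hunk. Segment values here are made up.
def lift_position(old_pos, segments):
    for old_start, old_end, new_start in segments:
        if old_start <= old_pos <= old_end:
            return new_start + (old_pos - old_start)
    return None  # position falls in a gap between segments

segments = [(1000, 1999, 5000), (3000, 3999, 8000)]
assert lift_position(1500, segments) == 5500
assert lift_position(2500, segments) is None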
- """ - now_test = self.getDatabaseSetting("testing") - if now_test is None or bool(int(now_test)) == bool(self._is_test): - self.setDatabaseSetting("testing", bool(self._is_test)) - return True - else: - return False - # setTesting(is_test) - - - def getVerbose(self): - """ - Gets the verbosity setting. - - Returns: - bool: True if verbose logging is enabled, otherwise False. - """ - return self._verbose - #getVerbose() - - - def setVerbose(self, verbose=True): - """ - Sets the verbosity setting. - - Args: - verbose (bool, optional): True to enable verbose logging, False to disable. - """ - self._verbose = verbose - #setVerbose() - - - def setLogger(self, logger=None): - """ - Sets the logger object. - - Args: - logger (Logger, optional): The logger object. - """ - self._logger = logger - #setLogger() - - - def log(self, message=""): - """ - Logs a message to the configured logger or standard output with indentation. - - Args: - message (str, optional): The message to log. Defaults to an empty string. - - Returns: - int: The current indentation level. - - The function logs the message with appropriate indentation and handles line breaks. - If a logger is set, it uses the logger to log the message. If verbose logging is enabled, - it writes the message to the standard output with indentation. - """ - if message != "" and message != "\n": - logtime = datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S") - message = logtime + " " + message - - if self._logger: - return self._logger.log(message) - if self._verbose: - if (self._logIndent > 0) and (not self._logHanging): - self._logFile.write(self._logIndent * " ") - self._logHanging = True - self._logFile.write(message) - if (message == "") or (message[-1] != "\n"): - self._logHanging = True - self._logFile.flush() - else: - self._logHanging = False - return self._logIndent - #log() - - - def logPush(self, message=None): - """ - Logs a message and increases the indentation level. - - Args: - message (str, optional): The message to log. Defaults to None. - - Returns: - int: The new indentation level. - - The function logs the message if provided and increases the indentation level for subsequent logs. - If a logger is set, it uses the logger to log the message. - """ - - if self._logger: - return self._logger.logPush(message) - if message: - self.log(message) - if self._logHanging: - self.log("\n") - self._logIndent += 1 - return self._logIndent - #logPush() - - - def logPop(self, message=None): - """ - Decreases the indentation level and logs a message. - - Args: - message (str, optional): The message to log. Defaults to None. - - Returns: - int: The new indentation level. - - The function decreases the indentation level and logs the message if provided. - If a logger is set, it uses the logger to log the message. - """ - - if self._logger: - return self._logger.logPop(message) - if self._logHanging: - self.log("\n") - self._logIndent = max(0, self._logIndent - 1) - if message: - self.log(message) - return self._logIndent - #logPop() - - - ################################################## - # database management - - - def getDatabaseMemoryUsage(self, resetPeak=False): - """ - Retrieves the current and peak memory usage of the database. - - Args: - resetPeak (bool, optional): If True, resets the peak memory usage after retrieving it. Defaults to False. - - Returns: - tuple: A tuple containing the current memory usage (int) and the peak memory usage (int) in bytes. 
- """ - return (apsw.memoryused(), apsw.memoryhighwater(resetPeak)) - #getDatabaseMemoryUsage() - - - def getDatabaseMemoryLimit(self): - """ - Retrieves the current memory limit for the database. - - Returns: - int: The current soft heap limit in bytes. - """ - return apsw.softheaplimit(-1) - #getDatabaseMemoryLimit() - - - def setDatabaseMemoryLimit(self, limit=0): - """ - Sets a new memory limit for the database. - - Args: - limit (int, optional): The new memory limit in bytes. Defaults to 0, which sets no limit. - """ - apsw.softheaplimit(limit) - #setDatabaseMemoryLimit() - - - def configureDatabase(self, db=None, tempMem=False): - """ - Configures database settings for performance and behavior. - - Args: - db (str, optional): The name of the database to configure. Defaults to None. - tempMem (bool, optional): If True, configures the temporary storage to use memory. Defaults to False. - - The function sets various PRAGMA settings to optimize performance for typical usage scenarios. - """ - cursor = self._db.cursor() - db = ("%s." % db) if db else "" - - # linux VFS doesn't usually report actual disk cluster size, - # so sqlite ends up using 1KB pages by default; we prefer 4KB - cursor.execute("PRAGMA %spage_size = 4096" % (db,)) - - # cache_size is pages if positive, kibibytes if negative; - # seems to only affect write performance - cursor.execute("PRAGMA %scache_size = -65536" % (db,)) - - # for typical read-only usage, synchronization behavior is moot anyway, - # and while updating we're not that worried about a power failure - # corrupting the database file since the user could just start the - # update over from the beginning; so, we'll take the performance gain - cursor.execute("PRAGMA %ssynchronous = OFF" % (db,)) - - # the journal isn't that big, so keeping it in memory is faster; the - # cost is that a system crash will corrupt the database rather than - # leaving it recoverable with the on-disk journal (a program crash - # should be fine since sqlite will rollback transactions before exiting) - cursor.execute("PRAGMA %sjournal_mode = MEMORY" % (db,)) - - # the temp store is used for all of sqlite's internal scratch space - # needs, such as the TEMP database, indexing, etc; keeping it in memory - # is much faster, but it can get quite large - if tempMem and not db: - cursor.execute("PRAGMA temp_store = MEMORY") - - # we want EXCLUSIVE while updating since the data shouldn't be read - # until ready and we want the performance gain; for normal read usage, - # NORMAL is better so multiple users can share a database file - cursor.execute("PRAGMA %slocking_mode = %s" % (db,("EXCLUSIVE" if self._updating else "NORMAL"))) - #configureDatabase() - - - def attachTempDatabase(self, db): - """ - Attaches a temporary database with the given name. - - Args: - db (str): The name of the temporary database to attach. - - The function first detaches any existing temporary database with the same name, then attaches a new one. - """ - cursor = self._db.cursor() - - # detach the current db, if any - try: - cursor.execute("DETACH DATABASE `%s`" % db) - except apsw.SQLError as e: - if not str(e).startswith('SQLError: no such database: '): - raise e - - # attach a new temp db - cursor.execute("ATTACH DATABASE '' AS `%s`" % db) - self.configureDatabase(db) - #attachTempDatabase() - - - def attachDatabaseFile(self, dbFile, quiet=False): - """ - Attaches a new database file and configures it. - - Args: - dbFile (str): The path to the database file to attach. 
- quiet (bool, optional): If True, suppresses log messages. Defaults to False. - - The function detaches any currently attached database file, then attaches the new one and configures it. - It also establishes or audits the database schema. - """ - cursor = self._db.cursor() - - # detach the current db file, if any - if self._dbFile and not quiet: - self.log("unloading knowledge database file '%s' ..." % self._dbFile) - try: - cursor.execute("DETACH DATABASE `db`") - except apsw.SQLError as e: - if not str(e).startswith('SQLError: no such database: '): - raise e - if self._dbFile and not quiet: - self.log("unloading knowledge database file completed\n") - - # reset db info - self._dbFile = None - self._dbNew = None - - # attach the new db file, if any - if dbFile: - if not quiet: - self.logPush("loading knowledge database file '%s' ..." % dbFile) - cursor.execute("ATTACH DATABASE ? AS `db`", (dbFile,)) - self._dbFile = dbFile - self._dbNew = (0 == max(row[0] for row in cursor.execute("SELECT COUNT(1) FROM `db`.`sqlite_master`"))) - self.configureDatabase('db') - - # establish or audit database schema - err_msg = "" - with self._db: - if self._dbNew: - self.createDatabaseObjects(None, 'db') - ok = True - else: - self.updateDatabaseSchema() - ok = self.auditDatabaseObjects(None, 'db') - if not ok: - err_msg = "Audit of database failed" - - if ok and self._updating: - ok = self._checkTesting() - if not ok: - err_msg = "Testing settings do not match loaded database" - - if ok: - if not quiet: - self.logPop("loading knowledge database file completed\n") - else: - self._dbFile = None - self._dbNew = None - cursor.execute("DETACH DATABASE `db`") - if not quiet: - self.logPop("... ERROR (" + err_msg + ")\n") - #if new dbFile - #attachDatabaseFile() - - - def detachDatabaseFile(self, quiet=False): - """ - Detaches the currently attached database file. - - Args: - quiet (bool, optional): If True, suppresses log messages. Defaults to False. - - Returns: - None - """ - return self.attachDatabaseFile(None, quiet=quiet) - #detachDatabaseFile() - - - def testDatabaseWriteable(self): - """ - Tests if the current database file is writeable. - - Raises: - Exception: If no database file is loaded or if the database is read-only. - - Returns: - bool: True if the database file is writeable. - """ - if self._dbFile == None: - raise Exception("ERROR: no knowledge database file is loaded") - try: - if self._db.readonly('db'): - raise Exception("ERROR: knowledge database file cannot be modified") - except AttributeError: # apsw.Connection.readonly() added in 3.7.11 - try: - self._db.cursor().execute("UPDATE `db`.`setting` SET value = value") - except apsw.ReadOnlyError: - raise Exception("ERROR: knowledge database file cannot be modified") - return True - #testDatabaseWriteable() - - - def createDatabaseObjects(self, schema, dbName, tblList=None, doTables=True, idxList=None, doIndecies=True): - """ - Creates tables and indices in the database based on the provided schema. - - Args: - schema (dict): The schema definition for the database objects. - dbName (str): The name of the database to create objects in. - tblList (list, optional): List of tables to create. Defaults to None, which creates all tables in the schema. - doTables (bool, optional): If True, creates tables. Defaults to True. - idxList (list, optional): List of indices to create. Defaults to None, which creates all indices in the schema. - doIndecies (bool, optional): If True, creates indices. Defaults to True. 
- - The function creates the specified tables and indices, inserting initial data if provided in the schema. - """ - cursor = self._db.cursor() - schema = schema or self._schema[dbName] - dbType = "TEMP " if (dbName == "temp") else "" - if tblList and isinstance(tblList, str): - tblList = (tblList,) - if idxList and isinstance(idxList, str): - idxList = (idxList,) - for tblName in (tblList or schema.keys()): - if doTables: - cursor.execute("CREATE %sTABLE IF NOT EXISTS `%s`.`%s` %s" % (dbType, dbName, tblName, schema[tblName]['table'])) - if 'data' in schema[tblName] and schema[tblName]['data']: - sql = "INSERT OR IGNORE INTO `%s`.`%s` VALUES (%s)" % (dbName, tblName, ("?,"*len(schema[tblName]['data'][0]))[:-1]) - # TODO: change how 'data' is defined so it can be tested without having to try inserting - try: - cursor.executemany(sql, schema[tblName]['data']) - except apsw.ReadOnlyError: - pass - if doIndecies: - for idxName in (idxList or schema[tblName]['index'].keys()): - if idxName not in schema[tblName]['index']: - raise Exception("ERROR: no definition for index '%s' on table '%s'" % (idxName,tblName)) - cursor.execute("CREATE INDEX IF NOT EXISTS `%s`.`%s` ON `%s` %s" % (dbName, idxName, tblName, schema[tblName]['index'][idxName])) - #foreach idxName in idxList - cursor.execute("ANALYZE `%s`.`%s`" % (dbName,tblName)) - #foreach tblName in tblList - - # this shouldn't be necessary since we don't manually modify the sqlite_stat* tables - #if doIndecies: - # cursor.execute("ANALYZE `%s`.`sqlite_master`" % (dbName,)) - #createDatabaseObjects() - - - def createDatabaseTables(self, schema, dbName, tblList, doIndecies=False): - """ - Creates tables in the database based on the provided schema. - - Args: - schema (dict): The schema definition for the database objects. - dbName (str): The name of the database to create tables in. - tblList (list): List of tables to create. - doIndecies (bool, optional): If True, creates indices. Defaults to False. - - The function creates the specified tables and optionally creates indices for them. - """ - return self.createDatabaseObjects(schema, dbName, tblList, True, None, doIndecies) - #createDatabaseTables() - - - def createDatabaseIndices(self, schema, dbName, tblList, doTables=False, idxList=None): - """ - Creates indices in the database based on the provided schema. - - Args: - schema (dict): The schema definition for the database objects. - dbName (str): The name of the database to create indices in. - tblList (list): List of tables to create indices for. - doTables (bool, optional): If True, creates tables as well. Defaults to False. - idxList (list, optional): List of indices to create. Defaults to None, which creates all indices in the schema. - - The function creates the specified indices and optionally creates tables for them. - """ - return self.createDatabaseObjects(schema, dbName, tblList, doTables, idxList, True) - #createDatabaseIndices() - - - def dropDatabaseObjects(self, schema, dbName, tblList=None, doTables=True, idxList=None, doIndecies=True): - """ - Drops tables and indices in the database based on the provided schema. - - Args: - schema (dict): The schema definition for the database objects. - dbName (str): The name of the database to drop objects from. - tblList (list, optional): List of tables to drop. Defaults to None, which drops all tables in the schema. - doTables (bool, optional): If True, drops tables. Defaults to True. - idxList (list, optional): List of indices to drop. 
Defaults to None, which drops all indices in the schema. - doIndecies (bool, optional): If True, drops indices. Defaults to True. - - The function drops the specified tables and indices from the database. - """ - cursor = self._db.cursor() - schema = schema or self._schema[dbName] - if tblList and isinstance(tblList, str): - tblList = (tblList,) - if idxList and isinstance(idxList, str): - idxList = (idxList,) - for tblName in (tblList or schema.keys()): - if doTables: - cursor.execute("DROP TABLE IF EXISTS `%s`.`%s`" % (dbName, tblName)) - elif doIndecies: - for idxName in (idxList or schema[tblName]['index'].keys()): - cursor.execute("DROP INDEX IF EXISTS `%s`.`%s`" % (dbName, idxName)) - #foreach idxName in idxList - #foreach tblName in tblList - #dropDatabaseObjects() - - - def dropDatabaseTables(self, schema, dbName, tblList): - """ - Drops tables in the database based on the provided schema. - - Args: - schema (dict): The schema definition for the database objects. - dbName (str): The name of the database to drop tables from. - tblList (list): List of tables to drop. - - The function drops the specified tables from the database. - """ - return self.dropDatabaseObjects(schema, dbName, tblList, True, None, True) - #dropDatabaseTables() - - - def dropDatabaseIndices(self, schema, dbName, tblList, idxList=None): - """ - Drops indices in the database based on the provided schema. - - Args: - schema (dict): The schema definition for the database objects. - dbName (str): The name of the database to drop indices from. - tblList (list): List of tables to drop indices for. - idxList (list, optional): List of indices to drop. Defaults to None, which drops all indices in the schema. - - The function drops the specified indices from the database. - """ - return self.dropDatabaseObjects(schema, dbName, tblList, False, idxList, True) - #dropDatabaseIndices() - - - def updateDatabaseSchema(self): - """ - Updates the database schema to the latest version. - - The function checks the current schema version and applies necessary updates to bring it to the latest version. - It logs the progress and results of each update step. - - Raises: - Exception: If an error occurs during the schema update process. - """ - cursor = self._db.cursor() - - if self.getDatabaseSetting('schema',int) < 2: - self.logPush("updating database schema to version 2 ...\n") - updateMap = { - 'snp_merge' : 'rsMerged,rsCurrent,source_id', - 'snp_locus' : 'rs,chr,pos,validated,source_id', - 'snp_entrez_role' : 'rs,entrez_id,role_id,source_id', - 'snp_biopolymer_role' : 'rs,biopolymer_id,role_id,source_id', - } - for tblName,tblColumns in updateMap.iteritems(): - self.log("%s ..." % (tblName,)) - cursor.execute("ALTER TABLE `db`.`%s` RENAME TO `___old_%s___`" % (tblName,tblName)) - self.createDatabaseTables(None, 'db', tblName) - cursor.execute("INSERT INTO `db`.`%s` (%s) SELECT %s FROM `db`.`___old_%s___`" % (tblName,tblColumns,tblColumns,tblName)) - cursor.execute("DROP TABLE `db`.`___old_%s___`" % (tblName,)) - self.createDatabaseIndices(None, 'db', tblName) - self.log(" OK\n") - self.setDatabaseSetting('schema', 2) - self.logPop("... 
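# Standalone sketch of the rename/recreate/copy/drop migration pattern used by
# updateDatabaseSchema() above, shown on a made-up 'demo' table with the
# standard-library sqlite3 module.
import sqlite3

con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE demo (rs INTEGER, chr TINYINT)")
con.executemany("INSERT INTO demo VALUES (?,?)", [(11, 1), (12, 2)])

columns = "rs,chr"
con.execute("ALTER TABLE demo RENAME TO ___old_demo___")
con.execute("CREATE TABLE demo (rs INTEGER NOT NULL, chr TINYINT NOT NULL)")  # new shape
con.execute("INSERT INTO demo (%s) SELECT %s FROM ___old_demo___" % (columns, columns))
con.execute("DROP TABLE ___old_demo___")
assert con.execute("SELECT COUNT(*) FROM demo").fetchone()[0] == 2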
OK\n") - #schema<2 - - if self.getDatabaseSetting('schema',int) < 3: - self.log("updating database schema to version 3 ...") - self.setDatabaseSetting('optimized', self.getDatabaseSetting('finalized',int)) - self.setDatabaseSetting('schema', 3) - self.log(" OK\n") - #schema<3 - #updateDatabaseSchema() - - - def auditDatabaseObjects(self, schema, dbName, tblList=None, doTables=True, idxList=None, doIndecies=True, doRepair=True): - """ - Audits the database objects against the provided schema and repairs discrepancies if specified. - - Args: - schema (dict, optional): The schema definition for the database objects. Defaults to None, which uses the internal schema. - dbName (str): The name of the database to audit. - tblList (list, optional): List of tables to audit. Defaults to None, which audits all tables in the schema. - doTables (bool, optional): If True, audits tables. Defaults to True. - idxList (list, optional): List of indices to audit. Defaults to None, which audits all indices in the schema. - doIndecies (bool, optional): If True, audits indices. Defaults to True. - doRepair (bool, optional): If True, repairs discrepancies. Defaults to True. - - Returns: - bool: True if the audit is successful and all objects match the schema, False otherwise. - - The function fetches the current database schema, compares it with the provided schema, and repairs any discrepancies if specified. - It logs warnings and errors for mismatches and repairs. - """ - # fetch current schema - cursor = self._db.cursor() - current = dict() - dbMaster = "`sqlite_temp_master`" if (dbName == "temp") else ("`%s`.`sqlite_master`" % (dbName,)) - sql = "SELECT tbl_name,type,name,COALESCE(sql,'') FROM %s WHERE type IN ('table','index')" % (dbMaster,) - for row in cursor.execute(sql): - tblName,objType,idxName,objDef = row - if tblName not in current: - current[tblName] = {'table':None, 'index':{}} - if objType == 'table': - current[tblName]['table'] = " ".join(objDef.strip().split()) - elif objType == 'index': - current[tblName]['index'][idxName] = " ".join(objDef.strip().split()) - tblEmpty = dict() - sql = None - for tblName in current: - tblEmpty[tblName] = True - sql = "SELECT 1 FROM `%s`.`%s` LIMIT 1" % (dbName,tblName) - for row in cursor.execute(sql): - tblEmpty[tblName] = False - # audit requested objects - schema = schema or self._schema[dbName] - if tblList and isinstance(tblList, str): - tblList = (tblList,) - if idxList and isinstance(idxList, str): - idxList = (idxList,) - ok = True - for tblName in (tblList or schema.keys()): - if doTables: - if tblName in current: - if current[tblName]['table'] == ("CREATE TABLE `%s` %s" % (tblName, " ".join(schema[tblName]['table'].strip().split()))): - if 'data' in schema[tblName] and schema[tblName]['data']: - sql = u"INSERT OR IGNORE INTO `%s`.`%s` VALUES (%s)" % (dbName, tblName, ("?,"*len(schema[tblName]['data'][0]))[:-1]) - # TODO: change how 'data' is defined so it can be tested without having to try inserting - try: - cursor.executemany(sql, schema[tblName]['data']) - except apsw.ReadOnlyError: - pass - elif doRepair and tblEmpty[tblName]: - self.log("WARNING: table '%s' schema mismatch -- repairing ..." 
% tblName) - self.dropDatabaseTables(schema, dbName, tblName) - self.createDatabaseTables(schema, dbName, tblName) - current[tblName]['index'] = dict() - self.log(" OK\n") - elif doRepair: - self.log("ERROR: table '%s' schema mismatch -- cannot repair\n" % tblName) - ok = False - else: - self.log("ERROR: table '%s' schema mismatch\n" % tblName) - ok = False - #if definition match - elif doRepair: - self.log("WARNING: table '%s' is missing -- repairing ..." % tblName) - self.createDatabaseTables(schema, dbName, tblName, doIndecies) - self.log(" OK\n") - else: - self.log("ERROR: table '%s' is missing\n" % tblName) - ok = False - #if tblName in current - #if doTables - if doIndecies: - for idxName in (idxList or schema[tblName]['index'].keys()): - if (tblName not in current) and not (doTables and doRepair): - self.log("ERROR: table '%s' is missing for index '%s'\n" % (tblName, idxName)) - ok = False - elif tblName in current and idxName in current[tblName]['index']: - if current[tblName]['index'][idxName] == ("CREATE INDEX `%s` ON `%s` %s" % (idxName, tblName, " ".join(schema[tblName]['index'][idxName].strip().split()))): - pass - elif doRepair: - self.log("WARNING: index '%s' on table '%s' schema mismatch -- repairing ..." % (idxName, tblName)) - self.dropDatabaseIndices(schema, dbName, tblName, idxName) - self.createDatabaseIndices(schema, dbName, tblName, False, idxName) - self.log(" OK\n") - else: - self.log("ERROR: index '%s' on table '%s' schema mismatch\n" % (idxName, tblName)) - ok = False - #if definition match - elif doRepair: - self.log("WARNING: index '%s' on table '%s' is missing -- repairing ..." % (idxName, tblName)) - self.createDatabaseIndices(schema, dbName, tblName, False, idxName) - self.log(" OK\n") - else: - self.log("ERROR: index '%s' on table '%s' is missing\n" % (idxName, tblName)) - ok = False - #if tblName,idxName in current - #foreach idxName in idxList - #if doIndecies - #foreach tblName in tblList - return ok - #auditDatabaseObjects() - - - def finalizeDatabase(self): - """ - Finalizes the database by discarding intermediate data and setting finalization flags. - - The function drops intermediate tables, recreates them, and sets the database settings to indicate that the database is finalized and not optimized. - - Returns: - None - """ - self.log("discarding intermediate data ...") - self.dropDatabaseTables(None, 'db', ('snp_entrez_role','biopolymer_name_name','group_member_name')) - self.createDatabaseTables(None, 'db', ('snp_entrez_role','biopolymer_name_name','group_member_name'), True) - self.log(" OK\n") - self.setDatabaseSetting('finalized', 1) - self.setDatabaseSetting('optimized', 0) - #finalizeDatabase() - - - def optimizeDatabase(self): - """ - Optimizes the database by updating optimizer statistics and compacting the database file. - - The function updates the database statistics for query optimization and compacts the database to free up space. - - Returns: - None - """ - self._db.cursor().execute("ANALYZE `db`") - self.log("updating optimizer statistics completed\n") - self.defragmentDatabase() - self.setDatabaseSetting('optimized', 1) - self.log("compacting knowledge database file completed\n") - #optimizeDatabase() - - - def defragmentDatabase(self): - """ - Defragments the database to compact it and free up space. - - The function detaches the current database file, performs a VACUUM operation to compact it, and then re-attaches the database file. 
- - Returns: - None - """ - # unfortunately sqlite's VACUUM doesn't work on attached databases, - # so we have to detach, make a new direct connection, then re-attach - if self._dbFile: - dbFile = self._dbFile - self.detachDatabaseFile(quiet=True) - db = apsw.Connection(dbFile) - db.cursor().execute("VACUUM") - db.close() - self.attachDatabaseFile(dbFile, quiet=True) - #defragmentDatabase() - - - def getDatabaseSetting(self, setting, type=None): - """ - Retrieves a specific setting value from the database. - - Args: - setting (str): The name of the setting to retrieve. - type (type, optional): The type to cast the setting value to. Defaults to None. - - Returns: - The setting value, cast to the specified type if provided. - """ - value = None - if self._dbFile: - for row in self._db.cursor().execute("SELECT value FROM `db`.`setting` WHERE setting = ?", (setting,)): - value = row[0] - if type: - value = type(value) if (value != None) else type() - return value - #getDatabaseSetting() - - - def setDatabaseSetting(self, setting, value): - """ - Sets a specific setting value in the database. - - Args: - setting (str): The name of the setting to set. - value: The value to set for the specified setting. - - Returns: - None - """ - self._db.cursor().execute("INSERT OR REPLACE INTO `db`.`setting` (setting, value) VALUES (?, ?)", (setting,value)) - #setDatabaseSetting() - - - def getSourceModules(self): - """ - Retrieves the source modules available for updating the database. - - If the updater is not already initialized, it imports and initializes the updater module. - - Returns: - list: A list of available source modules. - """ - if not self._updater: - import loki.loki_updater as loki_updater - self._updater = loki_updater.Updater(self, self._is_test) - return self._updater.getSourceModules() - #getSourceModules() - - - def getSourceModuleVersions(self, sources=None): - """ - Retrieves the versions of the specified source modules. - - If the updater is not already initialized, it imports and initializes the updater module. - - Args: - sources (list, optional): A list of source modules to get versions for. Defaults to None, which retrieves versions for all modules. - - Returns: - dict: A dictionary mapping source modules to their versions. - """ - if not self._updater: - import loki.loki_updater as loki_updater - self._updater = loki_updater.Updater(self, self._is_test) - return self._updater.getSourceModuleVersions(sources) - #getSourceModuleVersions() - - - def getSourceModuleOptions(self, sources=None): - """ - Retrieves the options for the specified source modules. - - If the updater is not already initialized, it imports and initializes the updater module. - - Args: - sources (list, optional): A list of source modules to get options for. Defaults to None, which retrieves options for all modules. - - Returns: - dict: A dictionary mapping source modules to their options. - """ - if not self._updater: - import loki.loki_updater as loki_updater - self._updater = loki_updater.Updater(self, self._is_test) - return self._updater.getSourceModuleOptions(sources) - #getSourceModuleOptions() - - - def updateDatabase(self, sources=None, sourceOptions=None, cacheOnly=False, forceUpdate=False): - """ - Updates the database using the specified source modules and options. - - If the updater is not already initialized, it imports and initializes the updater module. - - Args: - sources (list, optional): A list of source modules to update from. Defaults to None, which updates from all sources. 
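# Sketch of the key/value access pattern behind getDatabaseSetting()/setDatabaseSetting()
# above, on a standalone sqlite3 database with the same 'setting' table shape; the
# helper names here are made up for the illustration.
import sqlite3

con = sqlite3.connect(":memory:")
con.execute(
    "CREATE TABLE setting (setting VARCHAR(32) PRIMARY KEY NOT NULL, value VARCHAR(256))"
)

def set_setting(name, value):
    con.execute("INSERT OR REPLACE INTO setting (setting, value) VALUES (?, ?)", (name, value))

def get_setting(name, cast=None):
    row = con.execute("SELECT value FROM setting WHERE setting = ?", (name,)).fetchone()
    value = row[0] if row else None
    return cast(value) if (cast and value is not None) else value

set_setting("finalized", 0)
set_setting("finalized", 1)
assert get_setting("finalized", int) == 1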
- sourceOptions (dict, optional): A dictionary of options for the source modules. Defaults to None. - cacheOnly (bool, optional): If True, only updates the cache. Defaults to False. - forceUpdate (bool, optional): If True, forces the update even if not necessary. Defaults to False. - - Returns: - Any: The result of the update operation. - - Raises: - Exception: If the database is finalized and cannot be updated. - """ - if self.getDatabaseSetting('finalized',int): - raise Exception("ERROR: cannot update a finalized database") - if not self._updater: - import loki.loki_updater as loki_updater - self._updater = loki_updater.Updater(self, self._is_test) - return self._updater.updateDatabase(sources, sourceOptions, cacheOnly, forceUpdate) - #updateDatabase() - - - def prepareTableForUpdate(self, table): - """ - Prepares a table for update by the updater. - - If the database is finalized, it raises an exception. - - Args: - table (str): The name of the table to prepare for update. - - Returns: - Any: The result of the preparation. - - Raises: - Exception: If the database is finalized and cannot be updated. - """ - if self.getDatabaseSetting('finalized',int): - raise Exception("ERROR: cannot update a finalized database") - if self._updater: - return self._updater.prepareTableForUpdate(table) - return None - #prepareTableForUpdate() - - - def prepareTableForQuery(self, table): - """ - Prepares a table for query by the updater. - - Args: - table (str): The name of the table to prepare for query. - - Returns: - Any: The result of the preparation, or None if no updater is available. - """ - if self._updater: - return self._updater.prepareTableForQuery(table) - return None - #prepareTableForQuery() - - - ################################################## - # metadata retrieval - - - def generateGRChByUCSChg(self, ucschg): - """ - Generates GRCh values based on a given UCSC chain identifier. - - Args: - ucschg (str): The UCSC chain identifier. - - Returns: - generator: A generator yielding GRCh values corresponding to the given UCSC chain identifier. - """ - return (row[0] for row in self._db.cursor().execute("SELECT grch FROM grch_ucschg WHERE ucschg = ?", (ucschg,))) - #generateGRChByUCSChg() - - - def getUCSChgByGRCh(self, grch): - """ - Retrieves the UCSC chain identifier for a given GRCh value. - - Args: - grch (str): The GRCh value. - - Returns: - str: The UCSC chain identifier corresponding to the given GRCh value, or None if not found. - """ - ucschg = None - for row in self._db.cursor().execute("SELECT ucschg FROM grch_ucschg WHERE grch = ?", (grch,)): - ucschg = row[0] - return ucschg - #getUCSChgByGRCh() - - - def getLDProfileID(self, ldprofile): - """ - Retrieves the identifier for a given LD profile. - - Args: - ldprofile (str): The LD profile name. - - Returns: - int: The identifier of the LD profile, or None if not found. - """ - return self.getLDProfileIDs([ldprofile])[ldprofile] - #getLDProfileID() - - - def getLDProfileIDs(self, ldprofiles): - """ - Retrieves the identifiers for a list of LD profiles. - - Args: - ldprofiles (list): A list of LD profile names. - - Returns: - dict: A dictionary mapping LD profile names to their identifiers. - """ - if not self._dbFile: - return { l:None for l in ldprofiles } - sql = "SELECT i.ldprofile, l.ldprofile_id FROM (SELECT ? 
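# Sketch of the GRCh <-> UCSC hg translation that getUCSChgByGRCh() and
# generateGRChByUCSChg() perform against the grch_ucschg table; the pairs below are the
# seed rows from the schema earlier in this diff (additional pairs are fetched at
# update time).
GRCH_UCSCHG = {34: 16, 35: 17, 36: 18, 37: 19, 38: 38}

def ucschg_by_grch(grch):
    return GRCH_UCSCHG.get(grch)

def grch_by_ucschg(ucschg):
    return [g for g, u in GRCH_UCSCHG.items() if u == ucschg]

assert ucschg_by_grch(37) == 19
assert grch_by_ucschg(38) == [38]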
AS ldprofile) AS i LEFT JOIN `db`.`ldprofile` AS l ON LOWER(TRIM(l.ldprofile)) = LOWER(TRIM(i.ldprofile))" - with self._db: - ret = { row[0]:row[1] for row in self._db.cursor().executemany(sql, zip(ldprofiles)) } - return ret - #getLDProfileIDs() - - - def getLDProfiles(self, ldprofiles=None): - """ - Retrieves detailed information about LD profiles. - - Args: - ldprofiles (list, optional): A list of LD profile names. Defaults to None, which retrieves information for all profiles. - - Returns: - dict: A dictionary mapping LD profile names to a tuple containing their identifier, description, metric, and value. - """ - if not self._dbFile: - return { l:None for l in (ldprofiles or list()) } - with self._db: - if ldprofiles: - sql = "SELECT i.ldprofile, l.ldprofile_id, l.description, l.metric, l.value FROM (SELECT ? AS ldprofile) AS i LEFT JOIN `db`.`ldprofile` AS l ON LOWER(TRIM(l.ldprofile)) = LOWER(TRIM(i.ldprofile))" - ret = { row[0]:row[1:] for row in self._db.cursor().executemany(sql, zip(ldprofiles)) } - else: - sql = "SELECT l.ldprofile, l.ldprofile_id, l.description, l.metric, l.value FROM `db`.`ldprofile` AS l" - ret = { row[0]:row[1:] for row in self._db.cursor().execute(sql) } - return ret - #getLDProfiles() - - - def getNamespaceID(self, namespace): - """ - Retrieves the identifier for a given namespace. - - Args: - namespace (str): The namespace name. - - Returns: - int: The identifier of the namespace, or None if not found. - """ - return self.getNamespaceIDs([namespace])[namespace] - #getNamespaceID() - - - def getNamespaceIDs(self, namespaces): - """ - Retrieves the identifiers for a list of namespaces. - - Args: - namespaces (list): A list of namespace names. - - Returns: - dict: A dictionary mapping namespace names to their identifiers. - """ - if not self._dbFile: - return { n:None for n in namespaces } - sql = "SELECT i.namespace, n.namespace_id FROM (SELECT ? AS namespace) AS i LEFT JOIN `db`.`namespace` AS n ON n.namespace = LOWER(i.namespace)" - with self._db: - ret = { row[0]:row[1] for row in self._db.cursor().executemany(sql, zip(namespaces)) } - return ret - #getNamespaceIDs() - - - def getRelationshipID(self, relationship): - """ - Retrieves the identifier for a given relationship. - - Args: - relationship (str): The relationship name. - - Returns: - int: The identifier of the relationship, or None if not found. - """ - return self.getRelationshipIDs([relationship])[relationship] - #getRelationshipID() - - - def getRelationshipIDs(self, relationships): - """ - Retrieves the identifiers for a list of relationships. - - Args: - relationships (list): A list of relationship names. - - Returns: - dict: A dictionary mapping relationship names to their identifiers. - """ - if not self._dbFile: - return { r:None for r in relationships } - sql = "SELECT i.relationship, r.relationship_id FROM (SELECT ? AS relationship) AS i LEFT JOIN `db`.`relationship` AS r ON r.relationship = LOWER(i.relationship)" - with self._db: - ret = { row[0]:row[1] for row in self._db.cursor().executemany(sql, zip(relationships)) } - return ret - #getRelationshipIDs() - - - def getRoleID(self, role): - """ - Retrieves the identifier for a given role. - - Args: - role (str): The role name. - - Returns: - int: The identifier of the role, or None if not found. - """ - return self.getRoleIDs([role])[role] - #getRoleID() - - - def getRoleIDs(self, roles): - """ - Retrieves the identifiers for a list of roles. - - Args: - roles (list): A list of role names. 
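# Sketch of the batch-lookup pattern used by getNamespaceIDs(), getRoleIDs(), etc.
# above: zip(names) turns a flat list into the 1-tuples that executemany() expects, and
# the LEFT JOIN keeps unknown names in the result with a NULL id. Assumes apsw is
# installed; the table and rows are made up for the illustration.
import apsw

con = apsw.Connection(":memory:")
cur = con.cursor()
cur.execute(
    "CREATE TABLE namespace (namespace_id INTEGER PRIMARY KEY, namespace TEXT UNIQUE)"
)
cur.execute("INSERT INTO namespace (namespace) VALUES ('gene'), ('protein')")

names = ["gene", "protein", "unknown"]
sql = (
    "SELECT i.namespace, n.namespace_id "
    "FROM (SELECT ? AS namespace) AS i "
    "LEFT JOIN namespace AS n ON n.namespace = LOWER(i.namespace)"
)
ids = {row[0]: row[1] for row in cur.executemany(sql, zip(names))}
assert ids == {"gene": 1, "protein": 2, "unknown": None}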
- - Returns: - dict: A dictionary mapping role names to their identifiers. - """ - if not self._dbFile: - return { r:None for r in roles } - sql = "SELECT i.role, role_id FROM (SELECT ? AS role) AS i LEFT JOIN `db`.`role` AS r ON r.role = LOWER(i.role)" - with self._db: - ret = { row[0]:row[1] for row in self._db.cursor().executemany(sql, zip(roles)) } - return ret - #getRoleIDs() - - - def getSourceID(self, source): - """ - Retrieves the identifier for a given data source. - - Args: - source (str): The name of the data source. - - Returns: - int: The identifier of the data source, or None if not found. - """ - return self.getSourceIDs([source])[source] - #getSourceID() - - - def getSourceIDs(self, sources=None): - """ - Retrieves the identifiers for a list of data sources. - - Args: - sources (list, optional): A list of data source names. Defaults to None, which retrieves information for all sources. - - Returns: - dict: A dictionary mapping data source names to their identifiers. - """ - if not self._dbFile: - return { s:None for s in (sources or list()) } - if sources: - sql = "SELECT i.source, s.source_id FROM (SELECT ? AS source) AS i LEFT JOIN `db`.`source` AS s ON s.source = LOWER(i.source)" - with self._db: - ret = { row[0]:row[1] for row in self._db.cursor().executemany(sql, zip(sources)) } - else: - sql = "SELECT source, source_id FROM `db`.`source`" - with self._db: - ret = { row[0]:row[1] for row in self._db.cursor().execute(sql) } - return ret - #getSourceIDs() - - - def getSourceIDVersion(self, sourceID): - """ - Retrieves the version of a data source given its identifier. - - Args: - sourceID (int): The identifier of the data source. - - Returns: - str: The version of the data source, or None if not found. - """ - sql = "SELECT version FROM `db`.`source` WHERE source_id = ?" - ret = None - with self._db: - for row in self._db.cursor().execute(sql, (sourceID,)): - ret = row[0] - return ret - #getSourceIDVersion() - - - def getSourceIDOptions(self, sourceID): - """ - Retrieves the options associated with a data source given its identifier. - - Args: - sourceID (int): The identifier of the data source. - - Returns: - dict: A dictionary mapping option names to their values for the given data source. - """ - sql = "SELECT option, value FROM `db`.`source_option` WHERE source_id = ?" - with self._db: - ret = { row[0]:row[1] for row in self._db.cursor().execute(sql, (sourceID,)) } - return ret - #getSourceIDOptions() - - - def getSourceIDFiles(self, sourceID): - """ - Retrieves information about files associated with a data source given its identifier. - - Args: - sourceID (int): The identifier of the data source. - - Returns: - dict: A dictionary mapping filenames to tuples containing their modified date, size, and md5 hash. - """ - sql = "SELECT filename, COALESCE(modified,''), COALESCE(size,''), COALESCE(md5,'') FROM `db`.`source_file` WHERE source_id = ?" - with self._db: - ret = { row[0]:tuple(row[1:]) for row in self._db.cursor().execute(sql, (sourceID,)) } - return ret - #getSourceIDFiles() - - - def getTypeID(self, type): - """ - Retrieves the identifier for a given type. - - Args: - type (str): The name of the type. - - Returns: - int: The identifier of the type, or None if not found. - """ - return self.getTypeIDs([type])[type] - #getTypeID() - - - def getTypeIDs(self, types): - """ - Retrieves the identifiers for a list of types. - - Args: - types (list): A list of type names. - - Returns: - dict: A dictionary mapping type names to their identifiers. 
- """ - if not self._dbFile: - return { t:None for t in types } - sql = "SELECT i.type, t.type_id FROM (SELECT ? AS type) AS i LEFT JOIN `db`.`type` AS t ON t.type = LOWER(i.type)" - with self._db: - ret = { row[0]:row[1] for row in self._db.cursor().executemany(sql, zip(types)) } - return ret - #getTypeIDs() - - def getSubtypeID(self, subtype): - """ - Retrieves the identifier for a given subtype. - - Args: - subtype (str): The name of the subtype. - - Returns: - int: The identifier of the subtype, or None if not found. - """ - return self.getSubtypeIDs([subtype])[subtype] - #getSubtypeID() - - - def getSubtypeIDs(self, subtypes): - """ - Retrieves subtype IDs for given subtype names from the database. - - Args: - subtypes (list): A list of subtype names. - - Returns: - dict: A dictionary where keys are subtype names and values are their corresponding subtype IDs. - If a subtype is not found in the database, its value in the dictionary will be None. - """ - if not self._dbFile: - return { t:None for t in subtypes } - sql = "SELECT i.subtype, t.subtype_id FROM (SELECT ? AS subtype) AS i LEFT JOIN `db`.`subtype` AS t ON t.subtype = LOWER(i.subtype)" - with self._db: - ret = { row[0]:row[1] for row in self._db.cursor().executemany(sql, zip(subtypes)) } - return ret - #getSubtypeIDs() - - ################################################## - # snp data retrieval - - - def generateCurrentRSesByRSes(self, rses, tally=None): - """ - Generates current RS IDs by merging RS IDs from the database. - - Args: - rses (list): A list of tuples, where each tuple contains (rsMerged, extra). - tally (dict, optional): A dictionary to store tally counts for 'merge' and 'match'. Defaults to None. - - Yields: - tuple: A tuple containing (rsMerged, extra, rsCurrent). - """ - # rses=[ (rsInput,extra), ... ] - # tally=dict() - # yield:[ (rsInput,extra,rsCurrent), ... ] - sql = """ + "index": { + "chain_data__end": "(chain_id,old_end)", + }, + }, # .db.chain_data + }, # .db + } # _schema{} + + ################################################## + # constructor + + def __init__(self, dbFile=None, testing=False, updating=False, tempMem=False): + """ + Initializes a Database instance. + + Args: + dbFile (str, optional): The database file to attach. + testing (bool, optional): If True, runs in testing mode. + updating (bool, optional): If True, runs in updating mode. + tempMem (bool, optional): If True, uses memory for temporary storage. + """ + # initialize instance properties + self._is_test = testing + self._updating = updating + self._verbose = True + self._logger = None + self._logFile = sys.stderr + self._logIndent = 0 + self._logHanging = False + self._db = apsw.Connection("") + self._dbFile = None + self._dbNew = None + self._updater = None + self._liftOverCache = dict() # { (from,to) : [] } + + self.configureDatabase(tempMem=tempMem) + self.attachDatabaseFile(dbFile) + + # __init__() + + ################################################## + # context manager + + def __enter__(self): + """ + Enters the context manager. + + Returns: + Connection: The APSW connection object. + """ + return self._db.__enter__() + + # __enter__() + + def __exit__(self, excType, excVal, traceback): + """ + Exits the context manager. + + Args: + excType (type): Exception type. + excVal (Exception): Exception value. + traceback (traceback): Traceback object. + + Returns: + bool: True if no exception occurred, otherwise False. 
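+
+        Example (illustrative sketch; assumes `db` is a Database instance with
+        a writable knowledge database file attached):
+            >>> with db:
+            ...     db.setDatabaseSetting("testing", 0)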
+ """ + return self._db.__exit__(excType, excVal, traceback) + + # __exit__() + + ################################################## + # logging + + def _checkTesting(self): + """ + Checks and updates the testing setting in the database. + + Returns: + bool: True if testing settings match, otherwise False. + """ + now_test = self.getDatabaseSetting("testing") + if now_test is None or bool(int(now_test)) == bool(self._is_test): + self.setDatabaseSetting("testing", bool(self._is_test)) + return True + else: + return False + + # setTesting(is_test) + + def getVerbose(self): + """ + Gets the verbosity setting. + + Returns: + bool: True if verbose logging is enabled, otherwise False. + """ + return self._verbose + + # getVerbose() + + def setVerbose(self, verbose=True): + """ + Sets the verbosity setting. + + Args: + verbose (bool, optional): True to enable verbose logging, False to disable. + """ + self._verbose = verbose + + # setVerbose() + + def setLogger(self, logger=None): + """ + Sets the logger object. + + Args: + logger (Logger, optional): The logger object. + """ + self._logger = logger + + # setLogger() + + def log(self, message=""): + """ + Logs a message to the configured logger or standard output with indentation. + + Args: + message (str, optional): The message to log. Defaults to an empty string. + + Returns: + int: The current indentation level. + + The function logs the message with appropriate indentation and handles line breaks. + If a logger is set, it uses the logger to log the message. If verbose logging is enabled, + it writes the message to the standard output with indentation. + """ + if message != "" and message != "\n": + logtime = datetime.datetime.now().strftime("%d.%b %Y %H:%M:%S") + message = logtime + " " + message + + if self._logger: + return self._logger.log(message) + if self._verbose: + if (self._logIndent > 0) and (not self._logHanging): + self._logFile.write(self._logIndent * " ") + self._logHanging = True + self._logFile.write(message) + if (message == "") or (message[-1] != "\n"): + self._logHanging = True + self._logFile.flush() + else: + self._logHanging = False + return self._logIndent + + # log() + + def logPush(self, message=None): + """ + Logs a message and increases the indentation level. + + Args: + message (str, optional): The message to log. Defaults to None. + + Returns: + int: The new indentation level. + + The function logs the message if provided and increases the indentation level for subsequent logs. + If a logger is set, it uses the logger to log the message. + """ + + if self._logger: + return self._logger.logPush(message) + if message: + self.log(message) + if self._logHanging: + self.log("\n") + self._logIndent += 1 + return self._logIndent + + # logPush() + + def logPop(self, message=None): + """ + Decreases the indentation level and logs a message. + + Args: + message (str, optional): The message to log. Defaults to None. + + Returns: + int: The new indentation level. + + The function decreases the indentation level and logs the message if provided. + If a logger is set, it uses the logger to log the message. + """ + + if self._logger: + return self._logger.logPop(message) + if self._logHanging: + self.log("\n") + self._logIndent = max(0, self._logIndent - 1) + if message: + self.log(message) + return self._logIndent + + # logPop() + + ################################################## + # database management + + def getDatabaseMemoryUsage(self, resetPeak=False): + """ + Retrieves the current and peak memory usage of the database. 
+ + Args: + resetPeak (bool, optional): If True, resets the peak memory usage after retrieving it. Defaults to False. + + Returns: + tuple: A tuple containing the current memory usage (int) and the peak memory usage (int) in bytes. + """ + return (apsw.memoryused(), apsw.memoryhighwater(resetPeak)) + + # getDatabaseMemoryUsage() + + def getDatabaseMemoryLimit(self): + """ + Retrieves the current memory limit for the database. + + Returns: + int: The current soft heap limit in bytes. + """ + return apsw.softheaplimit(-1) + + # getDatabaseMemoryLimit() + + def setDatabaseMemoryLimit(self, limit=0): + """ + Sets a new memory limit for the database. + + Args: + limit (int, optional): The new memory limit in bytes. Defaults to 0, which sets no limit. + """ + apsw.softheaplimit(limit) + + # setDatabaseMemoryLimit() + + def configureDatabase(self, db=None, tempMem=False): + """ + Configures database settings for performance and behavior. + + Args: + db (str, optional): The name of the database to configure. Defaults to None. + tempMem (bool, optional): If True, configures the temporary storage to use memory. Defaults to False. + + The function sets various PRAGMA settings to optimize performance for typical usage scenarios. + """ + cursor = self._db.cursor() + db = ("%s." % db) if db else "" + + # linux VFS doesn't usually report actual disk cluster size, + # so sqlite ends up using 1KB pages by default; we prefer 4KB + cursor.execute("PRAGMA %spage_size = 4096" % (db,)) + + # cache_size is pages if positive, kibibytes if negative; + # seems to only affect write performance + cursor.execute("PRAGMA %scache_size = -65536" % (db,)) + + # for typical read-only usage, synchronization behavior is moot anyway, + # and while updating we're not that worried about a power failure + # corrupting the database file since the user could just start the + # update over from the beginning; so, we'll take the performance gain + cursor.execute("PRAGMA %ssynchronous = OFF" % (db,)) + + # the journal isn't that big, so keeping it in memory is faster; the + # cost is that a system crash will corrupt the database rather than + # leaving it recoverable with the on-disk journal (a program crash + # should be fine since sqlite will rollback transactions before exiting) + cursor.execute("PRAGMA %sjournal_mode = MEMORY" % (db,)) + + # the temp store is used for all of sqlite's internal scratch space + # needs, such as the TEMP database, indexing, etc; keeping it in memory + # is much faster, but it can get quite large + if tempMem and not db: + cursor.execute("PRAGMA temp_store = MEMORY") + + # we want EXCLUSIVE while updating since the data shouldn't be read + # until ready and we want the performance gain; for normal read usage, + # NORMAL is better so multiple users can share a database file + cursor.execute( + "PRAGMA %slocking_mode = %s" + % (db, ("EXCLUSIVE" if self._updating else "NORMAL")) + ) + + # configureDatabase() + + def attachTempDatabase(self, db): + """ + Attaches a temporary database with the given name. + + Args: + db (str): The name of the temporary database to attach. + + The function first detaches any existing temporary database with the same name, then attaches a new one. 
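+
+        Example (illustrative sketch; the name 'temp_work' is hypothetical):
+            >>> db.attachTempDatabase("temp_work")
+
+        Subsequent SQL can then reference tables as `temp_work`.`tablename`.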
+ """ + cursor = self._db.cursor() + + # detach the current db, if any + try: + cursor.execute("DETACH DATABASE `%s`" % db) + except apsw.SQLError as e: + if not str(e).startswith("SQLError: no such database: "): + raise e + + # attach a new temp db + cursor.execute("ATTACH DATABASE '' AS `%s`" % db) + self.configureDatabase(db) + + # attachTempDatabase() + + def attachDatabaseFile(self, dbFile, quiet=False): + """ + Attaches a new database file and configures it. + + Args: + dbFile (str): The path to the database file to attach. + quiet (bool, optional): If True, suppresses log messages. Defaults to False. + + The function detaches any currently attached database file, then attaches the new one and configures it. + It also establishes or audits the database schema. + """ + cursor = self._db.cursor() + + # detach the current db file, if any + if self._dbFile and not quiet: + self.log("unloading knowledge database file '%s' ..." % self._dbFile) + try: + cursor.execute("DETACH DATABASE `db`") + except apsw.SQLError as e: + if not str(e).startswith("SQLError: no such database: "): + raise e + if self._dbFile and not quiet: + self.log("unloading knowledge database file completed\n") + + # reset db info + self._dbFile = None + self._dbNew = None + + # attach the new db file, if any + if dbFile: + if not quiet: + self.logPush("loading knowledge database file '%s' ..." % dbFile) + cursor.execute("ATTACH DATABASE ? AS `db`", (dbFile,)) + self._dbFile = dbFile + self._dbNew = 0 == max( + row[0] + for row in cursor.execute("SELECT COUNT(1) FROM `db`.`sqlite_master`") + ) + self.configureDatabase("db") + + # establish or audit database schema + err_msg = "" + with self._db: + if self._dbNew: + self.createDatabaseObjects(None, "db") + ok = True + else: + self.updateDatabaseSchema() + ok = self.auditDatabaseObjects(None, "db") + if not ok: + err_msg = "Audit of database failed" + + if ok and self._updating: + ok = self._checkTesting() + if not ok: + err_msg = "Testing settings do not match loaded database" + + if ok: + if not quiet: + self.logPop("loading knowledge database file completed\n") + else: + self._dbFile = None + self._dbNew = None + cursor.execute("DETACH DATABASE `db`") + if not quiet: + self.logPop("... ERROR (" + err_msg + ")\n") + # if new dbFile + + # attachDatabaseFile() + + def detachDatabaseFile(self, quiet=False): + """ + Detaches the currently attached database file. + + Args: + quiet (bool, optional): If True, suppresses log messages. Defaults to False. + + Returns: + None + """ + return self.attachDatabaseFile(None, quiet=quiet) + + # detachDatabaseFile() + + def testDatabaseWriteable(self): + """ + Tests if the current database file is writeable. + + Raises: + Exception: If no database file is loaded or if the database is read-only. + + Returns: + bool: True if the database file is writeable. + """ + if self._dbFile == None: + raise Exception("ERROR: no knowledge database file is loaded") + try: + if self._db.readonly("db"): + raise Exception("ERROR: knowledge database file cannot be modified") + except AttributeError: # apsw.Connection.readonly() added in 3.7.11 + try: + self._db.cursor().execute("UPDATE `db`.`setting` SET value = value") + except apsw.ReadOnlyError: + raise Exception("ERROR: knowledge database file cannot be modified") + return True + + # testDatabaseWriteable() + + def createDatabaseObjects( + self, schema, dbName, tblList=None, doTables=True, idxList=None, doIndecies=True + ): + """ + Creates tables and indices in the database based on the provided schema. 
+ + Args: + schema (dict): The schema definition for the database objects. + dbName (str): The name of the database to create objects in. + tblList (list, optional): List of tables to create. Defaults to None, which creates all tables in the schema. + doTables (bool, optional): If True, creates tables. Defaults to True. + idxList (list, optional): List of indices to create. Defaults to None, which creates all indices in the schema. + doIndecies (bool, optional): If True, creates indices. Defaults to True. + + The function creates the specified tables and indices, inserting initial data if provided in the schema. + """ + cursor = self._db.cursor() + schema = schema or self._schema[dbName] + dbType = "TEMP " if (dbName == "temp") else "" + if tblList and isinstance(tblList, str): + tblList = (tblList,) + if idxList and isinstance(idxList, str): + idxList = (idxList,) + for tblName in tblList or schema.keys(): + if doTables: + cursor.execute( + "CREATE %sTABLE IF NOT EXISTS `%s`.`%s` %s" + % (dbType, dbName, tblName, schema[tblName]["table"]) + ) + if "data" in schema[tblName] and schema[tblName]["data"]: + sql = "INSERT OR IGNORE INTO `%s`.`%s` VALUES (%s)" % ( + dbName, + tblName, + ("?," * len(schema[tblName]["data"][0]))[:-1], + ) + # TODO: change how 'data' is defined so it can be tested without having to try inserting + try: + cursor.executemany(sql, schema[tblName]["data"]) + except apsw.ReadOnlyError: + pass + if doIndecies: + for idxName in idxList or schema[tblName]["index"].keys(): + if idxName not in schema[tblName]["index"]: + raise Exception( + "ERROR: no definition for index '%s' on table '%s'" + % (idxName, tblName) + ) + cursor.execute( + "CREATE INDEX IF NOT EXISTS `%s`.`%s` ON `%s` %s" + % (dbName, idxName, tblName, schema[tblName]["index"][idxName]) + ) + # foreach idxName in idxList + cursor.execute("ANALYZE `%s`.`%s`" % (dbName, tblName)) + # foreach tblName in tblList + + # this shouldn't be necessary since we don't manually modify the sqlite_stat* tables + # if doIndecies: + # cursor.execute("ANALYZE `%s`.`sqlite_master`" % (dbName,)) + + # createDatabaseObjects() + + def createDatabaseTables(self, schema, dbName, tblList, doIndecies=False): + """ + Creates tables in the database based on the provided schema. + + Args: + schema (dict): The schema definition for the database objects. + dbName (str): The name of the database to create tables in. + tblList (list): List of tables to create. + doIndecies (bool, optional): If True, creates indices. Defaults to False. + + The function creates the specified tables and optionally creates indices for them. + """ + return self.createDatabaseObjects( + schema, dbName, tblList, True, None, doIndecies + ) + + # createDatabaseTables() + + def createDatabaseIndices( + self, schema, dbName, tblList, doTables=False, idxList=None + ): + """ + Creates indices in the database based on the provided schema. + + Args: + schema (dict): The schema definition for the database objects. + dbName (str): The name of the database to create indices in. + tblList (list): List of tables to create indices for. + doTables (bool, optional): If True, creates tables as well. Defaults to False. + idxList (list, optional): List of indices to create. Defaults to None, which creates all indices in the schema. + + The function creates the specified indices and optionally creates tables for them. 
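+
+        Example (illustrative sketch; passing None for the schema falls back to
+        the built-in `_schema` definition, as updateDatabaseSchema does):
+            >>> db.createDatabaseIndices(None, "db", "snp_locus")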
+ """ + return self.createDatabaseObjects( + schema, dbName, tblList, doTables, idxList, True + ) + + # createDatabaseIndices() + + def dropDatabaseObjects( + self, schema, dbName, tblList=None, doTables=True, idxList=None, doIndecies=True + ): + """ + Drops tables and indices in the database based on the provided schema. + + Args: + schema (dict): The schema definition for the database objects. + dbName (str): The name of the database to drop objects from. + tblList (list, optional): List of tables to drop. Defaults to None, which drops all tables in the schema. + doTables (bool, optional): If True, drops tables. Defaults to True. + idxList (list, optional): List of indices to drop. Defaults to None, which drops all indices in the schema. + doIndecies (bool, optional): If True, drops indices. Defaults to True. + + The function drops the specified tables and indices from the database. + """ + cursor = self._db.cursor() + schema = schema or self._schema[dbName] + if tblList and isinstance(tblList, str): + tblList = (tblList,) + if idxList and isinstance(idxList, str): + idxList = (idxList,) + for tblName in tblList or schema.keys(): + if doTables: + cursor.execute("DROP TABLE IF EXISTS `%s`.`%s`" % (dbName, tblName)) + elif doIndecies: + for idxName in idxList or schema[tblName]["index"].keys(): + cursor.execute("DROP INDEX IF EXISTS `%s`.`%s`" % (dbName, idxName)) + # foreach idxName in idxList + # foreach tblName in tblList + + # dropDatabaseObjects() + + def dropDatabaseTables(self, schema, dbName, tblList): + """ + Drops tables in the database based on the provided schema. + + Args: + schema (dict): The schema definition for the database objects. + dbName (str): The name of the database to drop tables from. + tblList (list): List of tables to drop. + + The function drops the specified tables from the database. + """ + return self.dropDatabaseObjects(schema, dbName, tblList, True, None, True) + + # dropDatabaseTables() + + def dropDatabaseIndices(self, schema, dbName, tblList, idxList=None): + """ + Drops indices in the database based on the provided schema. + + Args: + schema (dict): The schema definition for the database objects. + dbName (str): The name of the database to drop indices from. + tblList (list): List of tables to drop indices for. + idxList (list, optional): List of indices to drop. Defaults to None, which drops all indices in the schema. + + The function drops the specified indices from the database. + """ + return self.dropDatabaseObjects(schema, dbName, tblList, False, idxList, True) + + # dropDatabaseIndices() + + def updateDatabaseSchema(self): + """ + Updates the database schema to the latest version. + + The function checks the current schema version and applies necessary updates to bring it to the latest version. + It logs the progress and results of each update step. + + Raises: + Exception: If an error occurs during the schema update process. + """ + cursor = self._db.cursor() + + if self.getDatabaseSetting("schema", int) < 2: + self.logPush("updating database schema to version 2 ...\n") + updateMap = { + "snp_merge": "rsMerged,rsCurrent,source_id", + "snp_locus": "rs,chr,pos,validated,source_id", + "snp_entrez_role": "rs,entrez_id,role_id,source_id", + "snp_biopolymer_role": "rs,biopolymer_id,role_id,source_id", + } + for tblName, tblColumns in updateMap.iteritems(): + self.log("%s ..." 
% (tblName,)) + cursor.execute( + "ALTER TABLE `db`.`%s` RENAME TO `___old_%s___`" + % (tblName, tblName) + ) + self.createDatabaseTables(None, "db", tblName) + cursor.execute( + "INSERT INTO `db`.`%s` (%s) SELECT %s FROM `db`.`___old_%s___`" + % (tblName, tblColumns, tblColumns, tblName) + ) + cursor.execute("DROP TABLE `db`.`___old_%s___`" % (tblName,)) + self.createDatabaseIndices(None, "db", tblName) + self.log(" OK\n") + self.setDatabaseSetting("schema", 2) + self.logPop("... OK\n") + # schema<2 + + if self.getDatabaseSetting("schema", int) < 3: + self.log("updating database schema to version 3 ...") + self.setDatabaseSetting( + "optimized", self.getDatabaseSetting("finalized", int) + ) + self.setDatabaseSetting("schema", 3) + self.log(" OK\n") + # schema<3 + + # updateDatabaseSchema() + + def auditDatabaseObjects( + self, + schema, + dbName, + tblList=None, + doTables=True, + idxList=None, + doIndecies=True, + doRepair=True, + ): + """ + Audits the database objects against the provided schema and repairs discrepancies if specified. + + Args: + schema (dict, optional): The schema definition for the database objects. Defaults to None, which uses the internal schema. + dbName (str): The name of the database to audit. + tblList (list, optional): List of tables to audit. Defaults to None, which audits all tables in the schema. + doTables (bool, optional): If True, audits tables. Defaults to True. + idxList (list, optional): List of indices to audit. Defaults to None, which audits all indices in the schema. + doIndecies (bool, optional): If True, audits indices. Defaults to True. + doRepair (bool, optional): If True, repairs discrepancies. Defaults to True. + + Returns: + bool: True if the audit is successful and all objects match the schema, False otherwise. + + The function fetches the current database schema, compares it with the provided schema, and repairs any discrepancies if specified. + It logs warnings and errors for mismatches and repairs. 
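+
+        Example (illustrative sketch; mirrors the call made from attachDatabaseFile):
+            >>> ok = db.auditDatabaseObjects(None, "db")
+
+        A False result means at least one object mismatched and could not be repaired.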
+ """ + # fetch current schema + cursor = self._db.cursor() + current = dict() + dbMaster = ( + "`sqlite_temp_master`" + if (dbName == "temp") + else ("`%s`.`sqlite_master`" % (dbName,)) + ) + sql = ( + "SELECT tbl_name,type,name,COALESCE(sql,'') FROM %s WHERE type IN ('table','index')" + % (dbMaster,) + ) + for row in cursor.execute(sql): + tblName, objType, idxName, objDef = row + if tblName not in current: + current[tblName] = {"table": None, "index": {}} + if objType == "table": + current[tblName]["table"] = " ".join(objDef.strip().split()) + elif objType == "index": + current[tblName]["index"][idxName] = " ".join(objDef.strip().split()) + tblEmpty = dict() + sql = None + for tblName in current: + tblEmpty[tblName] = True + sql = "SELECT 1 FROM `%s`.`%s` LIMIT 1" % (dbName, tblName) + for row in cursor.execute(sql): + tblEmpty[tblName] = False + # audit requested objects + schema = schema or self._schema[dbName] + if tblList and isinstance(tblList, str): + tblList = (tblList,) + if idxList and isinstance(idxList, str): + idxList = (idxList,) + ok = True + for tblName in tblList or schema.keys(): + if doTables: + if tblName in current: + if current[tblName]["table"] == ( + "CREATE TABLE `%s` %s" + % (tblName, " ".join(schema[tblName]["table"].strip().split())) + ): + if "data" in schema[tblName] and schema[tblName]["data"]: + sql = "INSERT OR IGNORE INTO `%s`.`%s` VALUES (%s)" % ( + dbName, + tblName, + ("?," * len(schema[tblName]["data"][0]))[:-1], + ) + # TODO: change how 'data' is defined so it can be tested without having to try inserting + try: + cursor.executemany(sql, schema[tblName]["data"]) + except apsw.ReadOnlyError: + pass + elif doRepair and tblEmpty[tblName]: + self.log( + "WARNING: table '%s' schema mismatch -- repairing ..." + % tblName + ) + self.dropDatabaseTables(schema, dbName, tblName) + self.createDatabaseTables(schema, dbName, tblName) + current[tblName]["index"] = dict() + self.log(" OK\n") + elif doRepair: + self.log( + "ERROR: table '%s' schema mismatch -- cannot repair\n" + % tblName + ) + ok = False + else: + self.log("ERROR: table '%s' schema mismatch\n" % tblName) + ok = False + # if definition match + elif doRepair: + self.log( + "WARNING: table '%s' is missing -- repairing ..." % tblName + ) + self.createDatabaseTables(schema, dbName, tblName, doIndecies) + self.log(" OK\n") + else: + self.log("ERROR: table '%s' is missing\n" % tblName) + ok = False + # if tblName in current + # if doTables + if doIndecies: + for idxName in idxList or schema[tblName]["index"].keys(): + if (tblName not in current) and not (doTables and doRepair): + self.log( + "ERROR: table '%s' is missing for index '%s'\n" + % (tblName, idxName) + ) + ok = False + elif tblName in current and idxName in current[tblName]["index"]: + if current[tblName]["index"][idxName] == ( + "CREATE INDEX `%s` ON `%s` %s" + % ( + idxName, + tblName, + " ".join( + schema[tblName]["index"][idxName].strip().split() + ), + ) + ): + pass + elif doRepair: + self.log( + "WARNING: index '%s' on table '%s' schema mismatch -- repairing ..." + % (idxName, tblName) + ) + self.dropDatabaseIndices(schema, dbName, tblName, idxName) + self.createDatabaseIndices( + schema, dbName, tblName, False, idxName + ) + self.log(" OK\n") + else: + self.log( + "ERROR: index '%s' on table '%s' schema mismatch\n" + % (idxName, tblName) + ) + ok = False + # if definition match + elif doRepair: + self.log( + "WARNING: index '%s' on table '%s' is missing -- repairing ..." 
+ % (idxName, tblName) + ) + self.createDatabaseIndices( + schema, dbName, tblName, False, idxName + ) + self.log(" OK\n") + else: + self.log( + "ERROR: index '%s' on table '%s' is missing\n" + % (idxName, tblName) + ) + ok = False + # if tblName,idxName in current + # foreach idxName in idxList + # if doIndecies + # foreach tblName in tblList + return ok + + # auditDatabaseObjects() + + def finalizeDatabase(self): + """ + Finalizes the database by discarding intermediate data and setting finalization flags. + + The function drops intermediate tables, recreates them, and sets the database settings to indicate that the database is finalized and not optimized. + + Returns: + None + """ + self.log("discarding intermediate data ...") + self.dropDatabaseTables( + None, "db", ("snp_entrez_role", "biopolymer_name_name", "group_member_name") + ) + self.createDatabaseTables( + None, + "db", + ("snp_entrez_role", "biopolymer_name_name", "group_member_name"), + True, + ) + self.log(" OK\n") + self.setDatabaseSetting("finalized", 1) + self.setDatabaseSetting("optimized", 0) + + # finalizeDatabase() + + def optimizeDatabase(self): + """ + Optimizes the database by updating optimizer statistics and compacting the database file. + + The function updates the database statistics for query optimization and compacts the database to free up space. + + Returns: + None + """ + self._db.cursor().execute("ANALYZE `db`") + self.log("updating optimizer statistics completed\n") + self.defragmentDatabase() + self.setDatabaseSetting("optimized", 1) + self.log("compacting knowledge database file completed\n") + + # optimizeDatabase() + + def defragmentDatabase(self): + """ + Defragments the database to compact it and free up space. + + The function detaches the current database file, performs a VACUUM operation to compact it, and then re-attaches the database file. + + Returns: + None + """ + # unfortunately sqlite's VACUUM doesn't work on attached databases, + # so we have to detach, make a new direct connection, then re-attach + if self._dbFile: + dbFile = self._dbFile + self.detachDatabaseFile(quiet=True) + db = apsw.Connection(dbFile) + db.cursor().execute("VACUUM") + db.close() + self.attachDatabaseFile(dbFile, quiet=True) + + # defragmentDatabase() + + def getDatabaseSetting(self, setting, type=None): + """ + Retrieves a specific setting value from the database. + + Args: + setting (str): The name of the setting to retrieve. + type (type, optional): The type to cast the setting value to. Defaults to None. + + Returns: + The setting value, cast to the specified type if provided. + """ + value = None + if self._dbFile: + for row in self._db.cursor().execute( + "SELECT value FROM `db`.`setting` WHERE setting = ?", (setting,) + ): + value = row[0] + if type: + value = type(value) if (value != None) else type() + return value + + # getDatabaseSetting() + + def setDatabaseSetting(self, setting, value): + """ + Sets a specific setting value in the database. + + Args: + setting (str): The name of the setting to set. + value: The value to set for the specified setting. + + Returns: + None + """ + self._db.cursor().execute( + "INSERT OR REPLACE INTO `db`.`setting` (setting, value) VALUES (?, ?)", + (setting, value), + ) + + # setDatabaseSetting() + + def getSourceModules(self): + """ + Retrieves the source modules available for updating the database. + + If the updater is not already initialized, it imports and initializes the updater module. + + Returns: + list: A list of available source modules. 
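+
+        Example (illustrative sketch; the module names shown are hypothetical):
+            >>> db.getSourceModules()
+            ['dbsnp', 'entrez', 'kegg', ...]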
+ """ + if not self._updater: + import loki.loki_updater as loki_updater + + self._updater = loki_updater.Updater(self, self._is_test) + return self._updater.getSourceModules() + + # getSourceModules() + + def getSourceModuleVersions(self, sources=None): + """ + Retrieves the versions of the specified source modules. + + If the updater is not already initialized, it imports and initializes the updater module. + + Args: + sources (list, optional): A list of source modules to get versions for. Defaults to None, which retrieves versions for all modules. + + Returns: + dict: A dictionary mapping source modules to their versions. + """ + if not self._updater: + import loki.loki_updater as loki_updater + + self._updater = loki_updater.Updater(self, self._is_test) + return self._updater.getSourceModuleVersions(sources) + + # getSourceModuleVersions() + + def getSourceModuleOptions(self, sources=None): + """ + Retrieves the options for the specified source modules. + + If the updater is not already initialized, it imports and initializes the updater module. + + Args: + sources (list, optional): A list of source modules to get options for. Defaults to None, which retrieves options for all modules. + + Returns: + dict: A dictionary mapping source modules to their options. + """ + if not self._updater: + import loki.loki_updater as loki_updater + + self._updater = loki_updater.Updater(self, self._is_test) + return self._updater.getSourceModuleOptions(sources) + + # getSourceModuleOptions() + + def updateDatabase( + self, sources=None, sourceOptions=None, cacheOnly=False, forceUpdate=False + ): + """ + Updates the database using the specified source modules and options. + + If the updater is not already initialized, it imports and initializes the updater module. + + Args: + sources (list, optional): A list of source modules to update from. Defaults to None, which updates from all sources. + sourceOptions (dict, optional): A dictionary of options for the source modules. Defaults to None. + cacheOnly (bool, optional): If True, only updates the cache. Defaults to False. + forceUpdate (bool, optional): If True, forces the update even if not necessary. Defaults to False. + + Returns: + Any: The result of the update operation. + + Raises: + Exception: If the database is finalized and cannot be updated. + """ + if self.getDatabaseSetting("finalized", int): + raise Exception("ERROR: cannot update a finalized database") + if not self._updater: + import loki.loki_updater as loki_updater + + self._updater = loki_updater.Updater(self, self._is_test) + return self._updater.updateDatabase( + sources, sourceOptions, cacheOnly, forceUpdate + ) + + # updateDatabase() + + def prepareTableForUpdate(self, table): + """ + Prepares a table for update by the updater. + + If the database is finalized, it raises an exception. + + Args: + table (str): The name of the table to prepare for update. + + Returns: + Any: The result of the preparation. + + Raises: + Exception: If the database is finalized and cannot be updated. + """ + if self.getDatabaseSetting("finalized", int): + raise Exception("ERROR: cannot update a finalized database") + if self._updater: + return self._updater.prepareTableForUpdate(table) + return None + + # prepareTableForUpdate() + + def prepareTableForQuery(self, table): + """ + Prepares a table for query by the updater. + + Args: + table (str): The name of the table to prepare for query. + + Returns: + Any: The result of the preparation, or None if no updater is available. 
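+
+        Example (illustrative sketch; only has an effect while an updater is active):
+            >>> db.prepareTableForQuery("snp_locus")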
+ """ + if self._updater: + return self._updater.prepareTableForQuery(table) + return None + + # prepareTableForQuery() + + ################################################## + # metadata retrieval + + def generateGRChByUCSChg(self, ucschg): + """ + Generates GRCh values based on a given UCSC chain identifier. + + Args: + ucschg (str): The UCSC chain identifier. + + Returns: + generator: A generator yielding GRCh values corresponding to the given UCSC chain identifier. + """ + return ( + row[0] + for row in self._db.cursor().execute( + "SELECT grch FROM grch_ucschg WHERE ucschg = ?", (ucschg,) + ) + ) + + # generateGRChByUCSChg() + + def getUCSChgByGRCh(self, grch): + """ + Retrieves the UCSC chain identifier for a given GRCh value. + + Args: + grch (str): The GRCh value. + + Returns: + str: The UCSC chain identifier corresponding to the given GRCh value, or None if not found. + """ + ucschg = None + for row in self._db.cursor().execute( + "SELECT ucschg FROM grch_ucschg WHERE grch = ?", (grch,) + ): + ucschg = row[0] + return ucschg + + # getUCSChgByGRCh() + + def getLDProfileID(self, ldprofile): + """ + Retrieves the identifier for a given LD profile. + + Args: + ldprofile (str): The LD profile name. + + Returns: + int: The identifier of the LD profile, or None if not found. + """ + return self.getLDProfileIDs([ldprofile])[ldprofile] + + # getLDProfileID() + + def getLDProfileIDs(self, ldprofiles): + """ + Retrieves the identifiers for a list of LD profiles. + + Args: + ldprofiles (list): A list of LD profile names. + + Returns: + dict: A dictionary mapping LD profile names to their identifiers. + """ + if not self._dbFile: + return {l: None for l in ldprofiles} + sql = "SELECT i.ldprofile, l.ldprofile_id FROM (SELECT ? AS ldprofile) AS i LEFT JOIN `db`.`ldprofile` AS l ON LOWER(TRIM(l.ldprofile)) = LOWER(TRIM(i.ldprofile))" + with self._db: + ret = { + row[0]: row[1] + for row in self._db.cursor().executemany(sql, zip(ldprofiles)) + } + return ret + + # getLDProfileIDs() + + def getLDProfiles(self, ldprofiles=None): + """ + Retrieves detailed information about LD profiles. + + Args: + ldprofiles (list, optional): A list of LD profile names. Defaults to None, which retrieves information for all profiles. + + Returns: + dict: A dictionary mapping LD profile names to a tuple containing their identifier, description, metric, and value. + """ + if not self._dbFile: + return {l: None for l in (ldprofiles or list())} + with self._db: + if ldprofiles: + sql = "SELECT i.ldprofile, l.ldprofile_id, l.description, l.metric, l.value FROM (SELECT ? AS ldprofile) AS i LEFT JOIN `db`.`ldprofile` AS l ON LOWER(TRIM(l.ldprofile)) = LOWER(TRIM(i.ldprofile))" + ret = { + row[0]: row[1:] + for row in self._db.cursor().executemany(sql, zip(ldprofiles)) + } + else: + sql = "SELECT l.ldprofile, l.ldprofile_id, l.description, l.metric, l.value FROM `db`.`ldprofile` AS l" + ret = {row[0]: row[1:] for row in self._db.cursor().execute(sql)} + return ret + + # getLDProfiles() + + def getNamespaceID(self, namespace): + """ + Retrieves the identifier for a given namespace. + + Args: + namespace (str): The namespace name. + + Returns: + int: The identifier of the namespace, or None if not found. + """ + return self.getNamespaceIDs([namespace])[namespace] + + # getNamespaceID() + + def getNamespaceIDs(self, namespaces): + """ + Retrieves the identifiers for a list of namespaces. + + Args: + namespaces (list): A list of namespace names. + + Returns: + dict: A dictionary mapping namespace names to their identifiers. 
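+
+        Example (illustrative sketch; the namespace names and IDs are hypothetical):
+            >>> db.getNamespaceIDs(["symbol", "entrez_gid"])
+            {'symbol': 1, 'entrez_gid': 2}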
+ """ + if not self._dbFile: + return {n: None for n in namespaces} + sql = "SELECT i.namespace, n.namespace_id FROM (SELECT ? AS namespace) AS i LEFT JOIN `db`.`namespace` AS n ON n.namespace = LOWER(i.namespace)" + with self._db: + ret = { + row[0]: row[1] + for row in self._db.cursor().executemany(sql, zip(namespaces)) + } + return ret + + # getNamespaceIDs() + + def getRelationshipID(self, relationship): + """ + Retrieves the identifier for a given relationship. + + Args: + relationship (str): The relationship name. + + Returns: + int: The identifier of the relationship, or None if not found. + """ + return self.getRelationshipIDs([relationship])[relationship] + + # getRelationshipID() + + def getRelationshipIDs(self, relationships): + """ + Retrieves the identifiers for a list of relationships. + + Args: + relationships (list): A list of relationship names. + + Returns: + dict: A dictionary mapping relationship names to their identifiers. + """ + if not self._dbFile: + return {r: None for r in relationships} + sql = "SELECT i.relationship, r.relationship_id FROM (SELECT ? AS relationship) AS i LEFT JOIN `db`.`relationship` AS r ON r.relationship = LOWER(i.relationship)" + with self._db: + ret = { + row[0]: row[1] + for row in self._db.cursor().executemany(sql, zip(relationships)) + } + return ret + + # getRelationshipIDs() + + def getRoleID(self, role): + """ + Retrieves the identifier for a given role. + + Args: + role (str): The role name. + + Returns: + int: The identifier of the role, or None if not found. + """ + return self.getRoleIDs([role])[role] + + # getRoleID() + + def getRoleIDs(self, roles): + """ + Retrieves the identifiers for a list of roles. + + Args: + roles (list): A list of role names. + + Returns: + dict: A dictionary mapping role names to their identifiers. + """ + if not self._dbFile: + return {r: None for r in roles} + sql = "SELECT i.role, role_id FROM (SELECT ? AS role) AS i LEFT JOIN `db`.`role` AS r ON r.role = LOWER(i.role)" + with self._db: + ret = { + row[0]: row[1] for row in self._db.cursor().executemany(sql, zip(roles)) + } + return ret + + # getRoleIDs() + + def getSourceID(self, source): + """ + Retrieves the identifier for a given data source. + + Args: + source (str): The name of the data source. + + Returns: + int: The identifier of the data source, or None if not found. + """ + return self.getSourceIDs([source])[source] + + # getSourceID() + + def getSourceIDs(self, sources=None): + """ + Retrieves the identifiers for a list of data sources. + + Args: + sources (list, optional): A list of data source names. Defaults to None, which retrieves information for all sources. + + Returns: + dict: A dictionary mapping data source names to their identifiers. + """ + if not self._dbFile: + return {s: None for s in (sources or list())} + if sources: + sql = "SELECT i.source, s.source_id FROM (SELECT ? AS source) AS i LEFT JOIN `db`.`source` AS s ON s.source = LOWER(i.source)" + with self._db: + ret = { + row[0]: row[1] + for row in self._db.cursor().executemany(sql, zip(sources)) + } + else: + sql = "SELECT source, source_id FROM `db`.`source`" + with self._db: + ret = {row[0]: row[1] for row in self._db.cursor().execute(sql)} + return ret + + # getSourceIDs() + + def getSourceIDVersion(self, sourceID): + """ + Retrieves the version of a data source given its identifier. + + Args: + sourceID (int): The identifier of the data source. + + Returns: + str: The version of the data source, or None if not found. 
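+
+        Example (illustrative sketch; the ID and version shown are hypothetical):
+            >>> db.getSourceIDVersion(3)
+            '105'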
+ """ + sql = "SELECT version FROM `db`.`source` WHERE source_id = ?" + ret = None + with self._db: + for row in self._db.cursor().execute(sql, (sourceID,)): + ret = row[0] + return ret + + # getSourceIDVersion() + + def getSourceIDOptions(self, sourceID): + """ + Retrieves the options associated with a data source given its identifier. + + Args: + sourceID (int): The identifier of the data source. + + Returns: + dict: A dictionary mapping option names to their values for the given data source. + """ + sql = "SELECT option, value FROM `db`.`source_option` WHERE source_id = ?" + with self._db: + ret = { + row[0]: row[1] for row in self._db.cursor().execute(sql, (sourceID,)) + } + return ret + + # getSourceIDOptions() + + def getSourceIDFiles(self, sourceID): + """ + Retrieves information about files associated with a data source given its identifier. + + Args: + sourceID (int): The identifier of the data source. + + Returns: + dict: A dictionary mapping filenames to tuples containing their modified date, size, and md5 hash. + """ + sql = "SELECT filename, COALESCE(modified,''), COALESCE(size,''), COALESCE(md5,'') FROM `db`.`source_file` WHERE source_id = ?" + with self._db: + ret = { + row[0]: tuple(row[1:]) + for row in self._db.cursor().execute(sql, (sourceID,)) + } + return ret + + # getSourceIDFiles() + + def getTypeID(self, type): + """ + Retrieves the identifier for a given type. + + Args: + type (str): The name of the type. + + Returns: + int: The identifier of the type, or None if not found. + """ + return self.getTypeIDs([type])[type] + + # getTypeID() + + def getTypeIDs(self, types): + """ + Retrieves the identifiers for a list of types. + + Args: + types (list): A list of type names. + + Returns: + dict: A dictionary mapping type names to their identifiers. + """ + if not self._dbFile: + return {t: None for t in types} + sql = "SELECT i.type, t.type_id FROM (SELECT ? AS type) AS i LEFT JOIN `db`.`type` AS t ON t.type = LOWER(i.type)" + with self._db: + ret = { + row[0]: row[1] for row in self._db.cursor().executemany(sql, zip(types)) + } + return ret + + # getTypeIDs() + + def getSubtypeID(self, subtype): + """ + Retrieves the identifier for a given subtype. + + Args: + subtype (str): The name of the subtype. + + Returns: + int: The identifier of the subtype, or None if not found. + """ + return self.getSubtypeIDs([subtype])[subtype] + + # getSubtypeID() + + def getSubtypeIDs(self, subtypes): + """ + Retrieves subtype IDs for given subtype names from the database. + + Args: + subtypes (list): A list of subtype names. + + Returns: + dict: A dictionary where keys are subtype names and values are their corresponding subtype IDs. + If a subtype is not found in the database, its value in the dictionary will be None. + """ + if not self._dbFile: + return {t: None for t in subtypes} + sql = "SELECT i.subtype, t.subtype_id FROM (SELECT ? AS subtype) AS i LEFT JOIN `db`.`subtype` AS t ON t.subtype = LOWER(i.subtype)" + with self._db: + ret = { + row[0]: row[1] + for row in self._db.cursor().executemany(sql, zip(subtypes)) + } + return ret + + # getSubtypeIDs() + + ################################################## + # snp data retrieval + + def generateCurrentRSesByRSes(self, rses, tally=None): + """ + Generates current RS IDs by merging RS IDs from the database. + + Args: + rses (list): A list of tuples, where each tuple contains (rsMerged, extra). + tally (dict, optional): A dictionary to store tally counts for 'merge' and 'match'. Defaults to None. 
+ + Yields: + tuple: A tuple containing (rsMerged, extra, rsCurrent). + """ + # rses=[ (rsInput,extra), ... ] + # tally=dict() + # yield:[ (rsInput,extra,rsCurrent), ... ] + sql = """ SELECT i.rsMerged, i.extra, COALESCE(sm.rsCurrent, i.rsMerged) AS rsCurrent FROM (SELECT ? AS rsMerged, ? AS extra) AS i LEFT JOIN `db`.`snp_merge` AS sm USING (rsMerged) """ - with self._db: - if tally != None: - numMerge = numMatch = 0 - for row in self._db.cursor().executemany(sql, rses): - if row[2] != row[0]: - numMerge += 1 - else: - numMatch += 1 - yield row - tally['merge'] = numMerge - tally['match'] = numMatch - else: - for row in self._db.cursor().executemany(sql, rses): - yield row - #generateCurrentRSesByRSes() - - - def generateSNPLociByRSes(self, rses, minMatch=1, maxMatch=1, validated=None, tally=None, errorCallback=None): - """ - Generates SNP loci by RS IDs from the database. - - Args: - rses (list): A list of tuples, where each tuple contains (rs, extra). - minMatch (int, optional): Minimum number of matches required. Defaults to 1. - maxMatch (int, optional): Maximum number of matches allowed. Defaults to 1. - validated (bool, optional): Flag to filter validated SNP loci. Defaults to None. - tally (dict, optional): A dictionary to store tally counts for 'zero', 'one', and 'many'. Defaults to None. - errorCallback (callable, optional): A callable function for error handling. Defaults to None. - - Yields: - tuple: A tuple containing (rs, extra, chr, pos) for each SNP locus. - """ - # rses=[ (rs,extra), ... ] - # tally=dict() - # yield:[ (rs,extra,chr,pos), ... ] - sql = """ + with self._db: + if tally != None: + numMerge = numMatch = 0 + for row in self._db.cursor().executemany(sql, rses): + if row[2] != row[0]: + numMerge += 1 + else: + numMatch += 1 + yield row + tally["merge"] = numMerge + tally["match"] = numMatch + else: + for row in self._db.cursor().executemany(sql, rses): + yield row + + # generateCurrentRSesByRSes() + + def generateSNPLociByRSes( + self, + rses, + minMatch=1, + maxMatch=1, + validated=None, + tally=None, + errorCallback=None, + ): + """ + Generates SNP loci by RS IDs from the database. + + Args: + rses (list): A list of tuples, where each tuple contains (rs, extra). + minMatch (int, optional): Minimum number of matches required. Defaults to 1. + maxMatch (int, optional): Maximum number of matches allowed. Defaults to 1. + validated (bool, optional): Flag to filter validated SNP loci. Defaults to None. + tally (dict, optional): A dictionary to store tally counts for 'zero', 'one', and 'many'. Defaults to None. + errorCallback (callable, optional): A callable function for error handling. Defaults to None. + + Yields: + tuple: A tuple containing (rs, extra, chr, pos) for each SNP locus. + """ + # rses=[ (rs,extra), ... ] + # tally=dict() + # yield:[ (rs,extra,chr,pos), ... ] + sql = """ SELECT i.rs, i.extra, sl.chr, sl.pos FROM (SELECT ? AS rs, ? 
AS extra) AS i LEFT JOIN `db`.`snp_locus` AS sl ON sl.rs = i.rs ORDER BY sl.chr, sl.pos """ - if validated != None: - sql += " AND sl.validated = %d" % (1 if validated else 0) - - minMatch = int(minMatch) if (minMatch != None) else 0 - maxMatch = int(maxMatch) if (maxMatch != None) else None - tag = matches = None - n = numZero = numOne = numMany = 0 - with self._db: - for row in itertools.chain(self._db.cursor().executemany(sql, rses), [(None,None,None,None)]): - if tag != row[0:2]: - if tag: - if not matches: - numZero += 1 - elif len(matches) == 1: - numOne += 1 - else: - numMany += 1 - - if minMatch <= len(matches) <= (maxMatch if (maxMatch != None) else len(matches)): - for match in (matches or [tag+(None,None)]): - yield match - elif errorCallback: - errorCallback("\t".join((t or "") for t in tag), "%s match%s at index %d" % ((len(matches) or "no"),("" if len(matches) == 1 else "es"),n)) - tag = row[0:2] - matches = list() - n += 1 - if row[2] and row[3]: - matches.append(row) - #foreach row - if tally != None: - tally['zero'] = numZero - tally['one'] = numOne - tally['many'] = numMany - #generateSNPLociByRSes() - - - ################################################## - # biopolymer data retrieval - - - def generateBiopolymersByIDs(self, ids): - """ - Generates biopolymers by their IDs from the database. - - Args: - ids (list): A list of tuples, where each tuple contains (id, extra). - - Yields: - tuple: A tuple containing (biopolymer_id, extra, type_id, label, description) for each biopolymer. - """ - # ids=[ (id,extra), ... ] - # yield:[ (id,extra,type_id,label,description), ... ] - sql = "SELECT biopolymer_id, ?2 AS extra, type_id, label, description FROM `db`.`biopolymer` WHERE biopolymer_id = ?1" - return self._db.cursor().executemany(sql, ids) - #generateBiopolymersByIDs() - - - def _lookupBiopolymerIDs(self, typeID, identifiers, minMatch, maxMatch, tally, errorCallback): - """ - Looks up biopolymer IDs based on identifiers from the database. - - Args: - typeID (int or Falseish): Type ID of the biopolymer, or Falseish for any type. - identifiers (list): A list of tuples, where each tuple contains (namespace, name, extra). - minMatch (int or Falseish): Minimum number of matches required, or Falseish for none. - maxMatch (int or Falseish): Maximum number of matches allowed, or Falseish for none. - tally (dict or None): A dictionary to store tally counts for 'zero', 'one', and 'many'. Defaults to None. - errorCallback (callable): A callable function for error handling. - - Yields: - tuple: A tuple containing (namespace, name, extra, id) for each matched biopolymer. - """ - # typeID=int or Falseish for any - # identifiers=[ (namespace,name,extra), ... 
] - # namespace='' or '*' for any, '-' for labels, '=' for biopolymer_id - # minMatch=int or Falseish for none - # maxMatch=int or Falseish for none - # tally=dict() or None - # errorCallback=callable(position,input,error) - # yields (namespace,name,extra,id) - - sql = """ + if validated != None: + sql += " AND sl.validated = %d" % (1 if validated else 0) + + minMatch = int(minMatch) if (minMatch != None) else 0 + maxMatch = int(maxMatch) if (maxMatch != None) else None + tag = matches = None + n = numZero = numOne = numMany = 0 + with self._db: + for row in itertools.chain( + self._db.cursor().executemany(sql, rses), [(None, None, None, None)] + ): + if tag != row[0:2]: + if tag: + if not matches: + numZero += 1 + elif len(matches) == 1: + numOne += 1 + else: + numMany += 1 + + if ( + minMatch + <= len(matches) + <= (maxMatch if (maxMatch != None) else len(matches)) + ): + for match in matches or [tag + (None, None)]: + yield match + elif errorCallback: + errorCallback( + "\t".join((t or "") for t in tag), + "%s match%s at index %d" + % ( + (len(matches) or "no"), + ("" if len(matches) == 1 else "es"), + n, + ), + ) + tag = row[0:2] + matches = list() + n += 1 + if row[2] and row[3]: + matches.append(row) + # foreach row + if tally != None: + tally["zero"] = numZero + tally["one"] = numOne + tally["many"] = numMany + + # generateSNPLociByRSes() + + ################################################## + # biopolymer data retrieval + + def generateBiopolymersByIDs(self, ids): + """ + Generates biopolymers by their IDs from the database. + + Args: + ids (list): A list of tuples, where each tuple contains (id, extra). + + Yields: + tuple: A tuple containing (biopolymer_id, extra, type_id, label, description) for each biopolymer. + """ + # ids=[ (id,extra), ... ] + # yield:[ (id,extra,type_id,label,description), ... ] + sql = "SELECT biopolymer_id, ?2 AS extra, type_id, label, description FROM `db`.`biopolymer` WHERE biopolymer_id = ?1" + return self._db.cursor().executemany(sql, ids) + + # generateBiopolymersByIDs() + + def _lookupBiopolymerIDs( + self, typeID, identifiers, minMatch, maxMatch, tally, errorCallback + ): + """ + Looks up biopolymer IDs based on identifiers from the database. + + Args: + typeID (int or Falseish): Type ID of the biopolymer, or Falseish for any type. + identifiers (list): A list of tuples, where each tuple contains (namespace, name, extra). + minMatch (int or Falseish): Minimum number of matches required, or Falseish for none. + maxMatch (int or Falseish): Maximum number of matches allowed, or Falseish for none. + tally (dict or None): A dictionary to store tally counts for 'zero', 'one', and 'many'. Defaults to None. + errorCallback (callable): A callable function for error handling. + + Yields: + tuple: A tuple containing (namespace, name, extra, id) for each matched biopolymer. + """ + # typeID=int or Falseish for any + # identifiers=[ (namespace,name,extra), ... 
] + # namespace='' or '*' for any, '-' for labels, '=' for biopolymer_id + # minMatch=int or Falseish for none + # maxMatch=int or Falseish for none + # tally=dict() or None + # errorCallback=callable(position,input,error) + # yields (namespace,name,extra,id) + + sql = """ SELECT i.namespace, i.identifier, i.extra, COALESCE(bID.biopolymer_id,bLabel.biopolymer_id,bName.biopolymer_id) AS biopolymer_id FROM (SELECT ?1 AS namespace, ?2 AS identifier, ?3 AS extra) AS i LEFT JOIN `db`.`biopolymer` AS bID @@ -2065,114 +2187,145 @@ def _lookupBiopolymerIDs(self, typeID, identifiers, minMatch, maxMatch, tally, e ON i.namespace NOT IN ('=','-') AND bName.biopolymer_id = bn.biopolymer_id AND ( ({0} IS NULL) OR (bName.type_id = {0}) ) -""".format(int(typeID) if typeID else "NULL") - - minMatch = int(minMatch) if (minMatch != None) else 0 - maxMatch = int(maxMatch) if (maxMatch != None) else None - tag = matches = None - n = numZero = numOne = numMany = 0 - with self._db: - for row in itertools.chain(self._db.cursor().executemany(sql, identifiers), [(None,None,None,None)]): - if tag != row[0:3]: - if tag: - if not matches: - numZero += 1 - elif len(matches) == 1: - numOne += 1 - else: - numMany += 1 - - if minMatch <= len(matches) <= (maxMatch if (maxMatch != None) else len(matches)): - for match in (matches or [tag+(None,)]): - yield match - elif errorCallback: - errorCallback("\t".join((t or "") for t in tag), "%s match%s at index %d" % ((len(matches) or "no"),("" if len(matches) == 1 else "es"),n)) - tag = row[0:3] - matches = set() - n += 1 - if row[3]: - matches.add(row) - #foreach row - if tally != None: - tally['zero'] = numZero - tally['one'] = numOne - tally['many'] = numMany - #_lookupBiopolymerIDs() - - - def generateBiopolymerIDsByIdentifiers(self, identifiers, minMatch=1, maxMatch=1, tally=None, errorCallback=None): - """ - Retrieve biopolymer IDs based on identifiers such as namespace and name. - - Parameters: - ----------- - identifiers : list of tuples - Each tuple contains (namespace, name, extra). - minMatch : int, optional - Minimum number of matches allowed (default is 1). - maxMatch : int, optional - Maximum number of matches allowed (default is 1). - tally : dict, optional - Dictionary to store match counts (default is None). - errorCallback : callable, optional - Function to handle errors. - - Returns: - -------- - Generator object yielding biopolymer IDs based on the given identifiers. - """ - # identifiers=[ (namespace,name,extra), ... ] - return self._lookupBiopolymerIDs(None, identifiers, minMatch, maxMatch, tally, errorCallback) - #generateBiopolymerIDsByIdentifiers() - - - def generateTypedBiopolymerIDsByIdentifiers(self, typeID, identifiers, minMatch=1, maxMatch=1, tally=None, errorCallback=None): - """ - Retrieve biopolymer IDs based on identifiers with a specific type. - - Parameters: - ----------- - typeID : int or None - Specific type ID for filtering. - identifiers : list of tuples - Each tuple contains (namespace, name, extra). - minMatch : int, optional - Minimum number of matches allowed (default is 1). - maxMatch : int, optional - Maximum number of matches allowed (default is 1). - tally : dict, optional - Dictionary to store match counts (default is None). - errorCallback : callable, optional - Function to handle errors. - - Returns: - -------- - Generator object yielding biopolymer IDs based on the given identifiers and type ID. - """ - # identifiers=[ (namespace,name,extra), ... 
] - return self._lookupBiopolymerIDs(typeID, identifiers, minMatch, maxMatch, tally, errorCallback) - #generateTypedBiopolymerIDsByIdentifiers() - - - def _searchBiopolymerIDs(self, typeID, texts): - """ - Helper method to perform text-based search for biopolymer IDs. - - Parameters: - ----------- - typeID : int or None - Specific type ID for filtering. - texts : list of tuples - Each tuple contains (text, extra). - - Yields: - ------- - Tuples containing biopolymer IDs based on the given search criteria and type ID. - """ - # texts=[ (text,extra), ... ] - # yields (extra,label,id) - - sql = """ +""".format( + int(typeID) if typeID else "NULL" + ) + + minMatch = int(minMatch) if (minMatch != None) else 0 + maxMatch = int(maxMatch) if (maxMatch != None) else None + tag = matches = None + n = numZero = numOne = numMany = 0 + with self._db: + for row in itertools.chain( + self._db.cursor().executemany(sql, identifiers), + [(None, None, None, None)], + ): + if tag != row[0:3]: + if tag: + if not matches: + numZero += 1 + elif len(matches) == 1: + numOne += 1 + else: + numMany += 1 + + if ( + minMatch + <= len(matches) + <= (maxMatch if (maxMatch != None) else len(matches)) + ): + for match in matches or [tag + (None,)]: + yield match + elif errorCallback: + errorCallback( + "\t".join((t or "") for t in tag), + "%s match%s at index %d" + % ( + (len(matches) or "no"), + ("" if len(matches) == 1 else "es"), + n, + ), + ) + tag = row[0:3] + matches = set() + n += 1 + if row[3]: + matches.add(row) + # foreach row + if tally != None: + tally["zero"] = numZero + tally["one"] = numOne + tally["many"] = numMany + + # _lookupBiopolymerIDs() + + def generateBiopolymerIDsByIdentifiers( + self, identifiers, minMatch=1, maxMatch=1, tally=None, errorCallback=None + ): + """ + Retrieve biopolymer IDs based on identifiers such as namespace and name. + + Parameters: + ----------- + identifiers : list of tuples + Each tuple contains (namespace, name, extra). + minMatch : int, optional + Minimum number of matches allowed (default is 1). + maxMatch : int, optional + Maximum number of matches allowed (default is 1). + tally : dict, optional + Dictionary to store match counts (default is None). + errorCallback : callable, optional + Function to handle errors. + + Returns: + -------- + Generator object yielding biopolymer IDs based on the given identifiers. + """ + # identifiers=[ (namespace,name,extra), ... ] + return self._lookupBiopolymerIDs( + None, identifiers, minMatch, maxMatch, tally, errorCallback + ) + + # generateBiopolymerIDsByIdentifiers() + + def generateTypedBiopolymerIDsByIdentifiers( + self, + typeID, + identifiers, + minMatch=1, + maxMatch=1, + tally=None, + errorCallback=None, + ): + """ + Retrieve biopolymer IDs based on identifiers with a specific type. + + Parameters: + ----------- + typeID : int or None + Specific type ID for filtering. + identifiers : list of tuples + Each tuple contains (namespace, name, extra). + minMatch : int, optional + Minimum number of matches allowed (default is 1). + maxMatch : int, optional + Maximum number of matches allowed (default is 1). + tally : dict, optional + Dictionary to store match counts (default is None). + errorCallback : callable, optional + Function to handle errors. + + Returns: + -------- + Generator object yielding biopolymer IDs based on the given identifiers and type ID. + """ + # identifiers=[ (namespace,name,extra), ... 
] + return self._lookupBiopolymerIDs( + typeID, identifiers, minMatch, maxMatch, tally, errorCallback + ) + + # generateTypedBiopolymerIDsByIdentifiers() + + def _searchBiopolymerIDs(self, typeID, texts): + """ + Helper method to perform text-based search for biopolymer IDs. + + Parameters: + ----------- + typeID : int or None + Specific type ID for filtering. + texts : list of tuples + Each tuple contains (text, extra). + + Yields: + ------- + Tuples containing biopolymer IDs based on the given search criteria and type ID. + """ + # texts=[ (text,extra), ... ] + # yields (extra,label,id) + + sql = """ SELECT ?2 AS extra, b.label, b.biopolymer_id FROM `db`.`biopolymer` AS b LEFT JOIN `db`.`biopolymer_name` AS bn USING (biopolymer_id) @@ -2183,79 +2336,82 @@ def _searchBiopolymerIDs(self, typeID, texts): OR bn.name LIKE '%'||?1||'%' ) """ - - if typeID: - sql += """ + + if typeID: + sql += ( + """ AND b.type_id = %d -""" % typeID - #if typeID - - sql += """ +""" + % typeID + ) + # if typeID + + sql += """ GROUP BY b.biopolymer_id """ - - return self._db.cursor().executemany(sql, texts) - #_searchBiopolymerIDs() - - - def generateBiopolymerIDsBySearch(self, searches): - """ - Retrieve biopolymer IDs based on a text-based search. - - Parameters: - ----------- - searches : list of tuples - Each tuple contains (text, extra). - - Returns: - -------- - Generator object yielding biopolymer IDs based on the given search criteria. - """ - # searches=[ (text,extra), ... ] - return self._searchBiopolymerIDs(None, searches) - #generateBiopolymerIDsBySearch() - - - def generateTypedBiopolymerIDsBySearch(self, typeID, searches): - """ - Retrieve biopolymer IDs based on a text-based search with a specific type. - - Parameters: - ----------- - typeID : int or None - Specific type ID for filtering. - searches : list of tuples - Each tuple contains (text, extra). - - Returns: - -------- - Generator object yielding biopolymer IDs based on the given search criteria and type ID. - """ - # searches=[ (text,extra), ... ] - return self._searchBiopolymerIDs(typeID, searches) - #generateTypedBiopolymerIDsBySearch() - - - def generateBiopolymerNameStats(self, namespaceID=None, typeID=None): - """ - Generate statistics on biopolymer names, including counts of unique and ambiguous names. - - Parameters: - ----------- - namespaceID : int or None, optional - Optional namespace ID filter. - typeID : int or None, optional - Optional type ID filter. - - Yields: - ------- - Tuples containing statistics for biopolymer names: - - `namespace`: Name of the namespace. - - `names`: Total number of names. - - `unique`: Number of unique names. - - `ambiguous`: Number of ambiguous names. - """ - sql = """ + + return self._db.cursor().executemany(sql, texts) + + # _searchBiopolymerIDs() + + def generateBiopolymerIDsBySearch(self, searches): + """ + Retrieve biopolymer IDs based on a text-based search. + + Parameters: + ----------- + searches : list of tuples + Each tuple contains (text, extra). + + Returns: + -------- + Generator object yielding biopolymer IDs based on the given search criteria. + """ + # searches=[ (text,extra), ... ] + return self._searchBiopolymerIDs(None, searches) + + # generateBiopolymerIDsBySearch() + + def generateTypedBiopolymerIDsBySearch(self, typeID, searches): + """ + Retrieve biopolymer IDs based on a text-based search with a specific type. + + Parameters: + ----------- + typeID : int or None + Specific type ID for filtering. + searches : list of tuples + Each tuple contains (text, extra). 
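# A sketch of the LIKE-based search wrappers above, assuming an open
# loki_db.Database instance `db`; the search term is a placeholder.
def search_biopolymers(db, term, type_id=None):
    searches = [(term, None)]  # (text, extra) pairs
    rows = (
        db.generateTypedBiopolymerIDsBySearch(type_id, searches)
        if type_id
        else db.generateBiopolymerIDsBySearch(searches)
    )
    # each row is (extra, label, biopolymer_id); substring hits on the label,
    # description, identifier or any known name are all returned
    return [(row[1], row[2]) for row in rows]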
+ + Returns: + -------- + Generator object yielding biopolymer IDs based on the given search criteria and type ID. + """ + # searches=[ (text,extra), ... ] + return self._searchBiopolymerIDs(typeID, searches) + + # generateTypedBiopolymerIDsBySearch() + + def generateBiopolymerNameStats(self, namespaceID=None, typeID=None): + """ + Generate statistics on biopolymer names, including counts of unique and ambiguous names. + + Parameters: + ----------- + namespaceID : int or None, optional + Optional namespace ID filter. + typeID : int or None, optional + Optional type ID filter. + + Yields: + ------- + Tuples containing statistics for biopolymer names: + - `namespace`: Name of the namespace. + - `names`: Total number of names. + - `unique`: Number of unique names. + - `ambiguous`: Number of ambiguous names. + """ + sql = """ SELECT `namespace`, COUNT() AS `names`, @@ -2265,88 +2421,95 @@ def generateBiopolymerNameStats(self, namespaceID=None, typeID=None): SELECT bn.namespace_id, bn.name, COUNT(DISTINCT bn.biopolymer_id) AS matches FROM `db`.`biopolymer_name` AS bn """ - - if typeID: - sql += """ + + if typeID: + sql += ( + """ JOIN `db`.`biopolymer` AS b ON b.biopolymer_id = bn.biopolymer_id AND b.type_id = %d -""" % typeID - - if namespaceID: - sql += """ +""" + % typeID + ) + + if namespaceID: + sql += ( + """ WHERE bn.namespace_id = %d -""" % namespaceID - - sql += """ +""" + % namespaceID + ) + + sql += """ GROUP BY bn.namespace_id, bn.name ) JOIN `db`.`namespace` AS n USING (namespace_id) GROUP BY namespace_id """ - - for row in self._db.cursor().execute(sql): - yield row - #generateBiopolymerNameStats() - - - ################################################## - # group data retrieval - - - def generateGroupsByIDs(self, ids): - """ - Retrieve groups based on provided group IDs. - - Parameters: - ----------- - ids : list of tuples - Each tuple contains (group_id, extra). - - Yields: - ------- - Tuples containing group information: - (group_id, extra, type_id, subtype_id, label, description) - """ - # ids=[ (id,extra), ... ] - # yield:[ (id,extra,type_id,subtype_id,label,description), ... ] - sql = "SELECT group_id, ?2 AS extra, type_id, subtype_id, label, description FROM `db`.`group` WHERE group_id = ?1" - return self._db.cursor().executemany(sql, ids) - #generateGroupsByIDs() - - - def _lookupGroupIDs(self, typeID, identifiers, minMatch, maxMatch, tally, errorCallback): - """ - Helper method to look up group IDs based on identifiers. - - Parameters: - ----------- - typeID : int or None - Specific type ID for filtering. - identifiers : list of tuples - Each tuple contains (namespace, name, extra). - minMatch : int or None - Minimum number of matches allowed. - maxMatch : int or None - Maximum number of matches allowed. - tally : dict or None - Dictionary to store match counts. - errorCallback : callable or None - Function to handle errors. - - Yields: - ------- - Tuples containing (namespace, name, extra, group_id). - """ - # typeID=int or Falseish for any - # identifiers=[ (namespace,name,extra), ... 
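# A sketch of reading the name statistics above, assuming an open
# loki_db.Database instance `db`; both filters are optional.
def report_name_ambiguity(db, namespace_id=None, type_id=None):
    for namespace, names, unique, ambiguous in db.generateBiopolymerNameStats(
        namespaceID=namespace_id, typeID=type_id
    ):
        # 'unique' names resolve to one biopolymer, 'ambiguous' names to several
        print("%s: %d names, %d unique, %d ambiguous" % (namespace, names, unique, ambiguous))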
] - # namespace='' or '*' for any, '-' for labels, '=' for group_id - # minMatch=int or Falseish for none - # maxMatch=int or Falseish for none - # tally=dict() or None - # errorCallback=callable(input,error) - # yields (namespace,name,extra,id) - - sql = """ + + for row in self._db.cursor().execute(sql): + yield row + + # generateBiopolymerNameStats() + + ################################################## + # group data retrieval + + def generateGroupsByIDs(self, ids): + """ + Retrieve groups based on provided group IDs. + + Parameters: + ----------- + ids : list of tuples + Each tuple contains (group_id, extra). + + Yields: + ------- + Tuples containing group information: + (group_id, extra, type_id, subtype_id, label, description) + """ + # ids=[ (id,extra), ... ] + # yield:[ (id,extra,type_id,subtype_id,label,description), ... ] + sql = "SELECT group_id, ?2 AS extra, type_id, subtype_id, label, description FROM `db`.`group` WHERE group_id = ?1" + return self._db.cursor().executemany(sql, ids) + + # generateGroupsByIDs() + + def _lookupGroupIDs( + self, typeID, identifiers, minMatch, maxMatch, tally, errorCallback + ): + """ + Helper method to look up group IDs based on identifiers. + + Parameters: + ----------- + typeID : int or None + Specific type ID for filtering. + identifiers : list of tuples + Each tuple contains (namespace, name, extra). + minMatch : int or None + Minimum number of matches allowed. + maxMatch : int or None + Maximum number of matches allowed. + tally : dict or None + Dictionary to store match counts. + errorCallback : callable or None + Function to handle errors. + + Yields: + ------- + Tuples containing (namespace, name, extra, group_id). + """ + # typeID=int or Falseish for any + # identifiers=[ (namespace,name,extra), ... 
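# A sketch of fetching group records by ID, assuming an open loki_db.Database
# instance `db`; the IDs passed in are placeholders.
def fetch_group_labels(db, group_ids):
    # the generator expects (group_id, extra) pairs and yields
    # (group_id, extra, type_id, subtype_id, label, description)
    pairs = [(gid, None) for gid in group_ids]
    return {row[0]: row[4] for row in db.generateGroupsByIDs(pairs)}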
] + # namespace='' or '*' for any, '-' for labels, '=' for group_id + # minMatch=int or Falseish for none + # maxMatch=int or Falseish for none + # tally=dict() or None + # errorCallback=callable(input,error) + # yields (namespace,name,extra,id) + + sql = """ SELECT i.namespace, i.identifier, i.extra, COALESCE(gID.group_id,gLabel.group_id,gName.group_id) AS group_id FROM (SELECT ?1 AS namespace, ?2 AS identifier, ?3 AS extra) AS i LEFT JOIN `db`.`group` AS gID @@ -2368,115 +2531,146 @@ def _lookupGroupIDs(self, typeID, identifiers, minMatch, maxMatch, tally, errorC ON i.namespace NOT IN ('=','-') AND gName.group_id = gn.group_id AND ( ({0} IS NULL) OR (gName.type_id = {0}) ) -""".format(int(typeID) if typeID else "NULL") - - minMatch = int(minMatch) if (minMatch != None) else 0 - maxMatch = int(maxMatch) if (maxMatch != None) else None - tag = matches = None - n = numZero = numOne = numMany = 0 - with self._db: - for row in itertools.chain(self._db.cursor().executemany(sql, identifiers), [(None,None,None,None)]): - if tag != row[0:3]: - if tag: - if not matches: - numZero += 1 - elif len(matches) == 1: - numOne += 1 - else: - numMany += 1 - - if minMatch <= len(matches) <= (maxMatch if (maxMatch != None) else len(matches)): - for match in (matches or [tag+(None,)]): - yield match - elif errorCallback: - errorCallback("\t".join((t or "") for t in tag), "%s match%s at index %d" % ((len(matches) or "no"),("" if len(matches) == 1 else "es"),n)) - tag = row[0:3] - matches = set() - n += 1 - if row[3]: - matches.add(row) - #foreach row - if tally != None: - tally['zero'] = numZero - tally['one'] = numOne - tally['many'] = numMany - #_lookupGroupIDs() - - - def generateGroupIDsByIdentifiers(self, identifiers, minMatch=1, maxMatch=1, tally=None, errorCallback=None): - """ - Generate group IDs based on identifiers such as namespace and name. - - Parameters: - ----------- - identifiers : list of tuples - Each tuple contains (namespace, name, extra). - minMatch : int, optional - Minimum number of matches allowed (default is 1). - maxMatch : int, optional - Maximum number of matches allowed (default is 1). - tally : dict, optional - Dictionary to store match counts (default is None). - errorCallback : callable, optional - Function to handle errors. - - Yields: - ------- - Tuples containing (namespace, name, extra, group_id). - """ - # identifiers=[ (namespace,name,extra), ... ] - return self._lookupGroupIDs(None, identifiers, minMatch, maxMatch, tally, errorCallback) - #generateGroupIDsByIdentifiers() - - - def generateTypedGroupIDsByIdentifiers(self, typeID, identifiers, minMatch=1, maxMatch=1, tally=None, errorCallback=None): - """ - Generate group IDs based on identifiers with a specific type. - - Parameters: - ----------- - typeID : int - Specific type ID for filtering. - identifiers : list of tuples - Each tuple contains (namespace, name, extra). - minMatch : int, optional - Minimum number of matches allowed (default is 1). - maxMatch : int, optional - Maximum number of matches allowed (default is 1). - tally : dict, optional - Dictionary to store match counts (default is None). - errorCallback : callable, optional - Function to handle errors. - - Yields: - ------- - Tuples containing (namespace, name, extra, group_id). - """ - - # identifiers=[ (namespace,name,extra), ... 
] - return self._lookupGroupIDs(typeID, identifiers, minMatch, maxMatch, tally, errorCallback) - #generateTypedGroupIDsByIdentifiers() - - - def _searchGroupIDs(self, typeID, texts): - """ - Helper method to perform text-based search for group IDs. - - Parameters: - ----------- - typeID : int or None - Specific type ID for filtering. - texts : list of tuples - Each tuple contains (text, extra). - - Yields: - ------- - Tuples containing group IDs based on the given search criteria and type ID. - """ - # texts=[ (text,extra), ... ] - # yields (extra,label,id) - - sql = """ +""".format( + int(typeID) if typeID else "NULL" + ) + + minMatch = int(minMatch) if (minMatch != None) else 0 + maxMatch = int(maxMatch) if (maxMatch != None) else None + tag = matches = None + n = numZero = numOne = numMany = 0 + with self._db: + for row in itertools.chain( + self._db.cursor().executemany(sql, identifiers), + [(None, None, None, None)], + ): + if tag != row[0:3]: + if tag: + if not matches: + numZero += 1 + elif len(matches) == 1: + numOne += 1 + else: + numMany += 1 + + if ( + minMatch + <= len(matches) + <= (maxMatch if (maxMatch != None) else len(matches)) + ): + for match in matches or [tag + (None,)]: + yield match + elif errorCallback: + errorCallback( + "\t".join((t or "") for t in tag), + "%s match%s at index %d" + % ( + (len(matches) or "no"), + ("" if len(matches) == 1 else "es"), + n, + ), + ) + tag = row[0:3] + matches = set() + n += 1 + if row[3]: + matches.add(row) + # foreach row + if tally != None: + tally["zero"] = numZero + tally["one"] = numOne + tally["many"] = numMany + + # _lookupGroupIDs() + + def generateGroupIDsByIdentifiers( + self, identifiers, minMatch=1, maxMatch=1, tally=None, errorCallback=None + ): + """ + Generate group IDs based on identifiers such as namespace and name. + + Parameters: + ----------- + identifiers : list of tuples + Each tuple contains (namespace, name, extra). + minMatch : int, optional + Minimum number of matches allowed (default is 1). + maxMatch : int, optional + Maximum number of matches allowed (default is 1). + tally : dict, optional + Dictionary to store match counts (default is None). + errorCallback : callable, optional + Function to handle errors. + + Yields: + ------- + Tuples containing (namespace, name, extra, group_id). + """ + # identifiers=[ (namespace,name,extra), ... ] + return self._lookupGroupIDs( + None, identifiers, minMatch, maxMatch, tally, errorCallback + ) + + # generateGroupIDsByIdentifiers() + + def generateTypedGroupIDsByIdentifiers( + self, + typeID, + identifiers, + minMatch=1, + maxMatch=1, + tally=None, + errorCallback=None, + ): + """ + Generate group IDs based on identifiers with a specific type. + + Parameters: + ----------- + typeID : int + Specific type ID for filtering. + identifiers : list of tuples + Each tuple contains (namespace, name, extra). + minMatch : int, optional + Minimum number of matches allowed (default is 1). + maxMatch : int, optional + Maximum number of matches allowed (default is 1). + tally : dict, optional + Dictionary to store match counts (default is None). + errorCallback : callable, optional + Function to handle errors. + + Yields: + ------- + Tuples containing (namespace, name, extra, group_id). + """ + + # identifiers=[ (namespace,name,extra), ... 
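# A sketch of the group lookup above, assuming an open loki_db.Database
# instance `db`. The special namespaces mirror the biopolymer lookup: '' or
# '*' match any namespace, '-' matches group labels, '=' treats the value as a
# literal group_id; the pathway label used here is a placeholder.
def resolve_group_id(db, label):
    tally = {}
    rows = list(
        db.generateGroupIDsByIdentifiers(
            [("-", label, None)],  # resolve the value as a group label
            minMatch=1,
            maxMatch=1,
            tally=tally,
        )
    )
    # rows are (namespace, name, extra, group_id)
    return rows[0][3] if rows else None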
] + return self._lookupGroupIDs( + typeID, identifiers, minMatch, maxMatch, tally, errorCallback + ) + + # generateTypedGroupIDsByIdentifiers() + + def _searchGroupIDs(self, typeID, texts): + """ + Helper method to perform text-based search for group IDs. + + Parameters: + ----------- + typeID : int or None + Specific type ID for filtering. + texts : list of tuples + Each tuple contains (text, extra). + + Yields: + ------- + Tuples containing group IDs based on the given search criteria and type ID. + """ + # texts=[ (text,extra), ... ] + # yields (extra,label,id) + + sql = """ SELECT ?2 AS extra, g.label, g.group_id FROM `db`.`group` AS g LEFT JOIN `db`.`group_name` AS gn USING (group_id) @@ -2487,78 +2681,81 @@ def _searchGroupIDs(self, typeID, texts): OR gn.name LIKE '%'||?1||'%' ) """ - - if typeID: - sql += """ + + if typeID: + sql += ( + """ AND g.type_id = %d -""" % typeID - #if typeID - - sql += """ +""" + % typeID + ) + # if typeID + + sql += """ GROUP BY g.group_id """ - - return self._db.cursor().executemany(sql, texts) - #_searchGroupIDs() - - - def generateGroupIDsBySearch(self, searches): - """ - Retrieve group IDs based on a text-based search. - - Parameters: - ----------- - searches : list of tuples - Each tuple contains (text, extra). - - Yields: - ------- - Tuples containing group IDs based on the given search criteria. - (extra, label, group_id) - """ - # searches=[ (text,extra), ... ] - return self._searchGroupIDs(None, searches) - #generateGroupIDsBySearch() - - - def generateTypedGroupIDsBySearch(self, typeID, searches): - """ - Retrieve group IDs based on a text-based search with a specific type. - - Parameters: - ----------- - typeID : int - Specific type ID for filtering. - searches : list of tuples - Each tuple contains (text, extra). - - Yields: - ------- - Tuples containing group IDs based on the given search criteria and type ID. - (extra, label, group_id) - """ - # searches=[ (text,extra), ... ] - return self._searchGroupIDs(typeID, searches) - #generateTypedGroupIDsBySearch() - - - def generateGroupNameStats(self, namespaceID=None, typeID=None): - """ - Generate statistics on group names. - - Parameters: - ----------- - namespaceID : int or None, optional - Namespace ID for filtering (default is None). - typeID : int or None, optional - Specific type ID for filtering (default is None). - - Yields: - ------- - Tuples containing statistics on group names: - (namespace, names, unique, ambiguous) - """ - sql = """ + + return self._db.cursor().executemany(sql, texts) + + # _searchGroupIDs() + + def generateGroupIDsBySearch(self, searches): + """ + Retrieve group IDs based on a text-based search. + + Parameters: + ----------- + searches : list of tuples + Each tuple contains (text, extra). + + Yields: + ------- + Tuples containing group IDs based on the given search criteria. + (extra, label, group_id) + """ + # searches=[ (text,extra), ... ] + return self._searchGroupIDs(None, searches) + + # generateGroupIDsBySearch() + + def generateTypedGroupIDsBySearch(self, typeID, searches): + """ + Retrieve group IDs based on a text-based search with a specific type. + + Parameters: + ----------- + typeID : int + Specific type ID for filtering. + searches : list of tuples + Each tuple contains (text, extra). + + Yields: + ------- + Tuples containing group IDs based on the given search criteria and type ID. + (extra, label, group_id) + """ + # searches=[ (text,extra), ... 
] + return self._searchGroupIDs(typeID, searches) + + # generateTypedGroupIDsBySearch() + + def generateGroupNameStats(self, namespaceID=None, typeID=None): + """ + Generate statistics on group names. + + Parameters: + ----------- + namespaceID : int or None, optional + Namespace ID for filtering (default is None). + typeID : int or None, optional + Specific type ID for filtering (default is None). + + Yields: + ------- + Tuples containing statistics on group names: + (namespace, names, unique, ambiguous) + """ + sql = """ SELECT `namespace`, COUNT() AS `names`, @@ -2568,87 +2765,92 @@ def generateGroupNameStats(self, namespaceID=None, typeID=None): SELECT gn.namespace_id, gn.name, COUNT(DISTINCT gn.group_id) AS matches FROM `db`.`group_name` AS gn """ - - if typeID: - sql += """ + + if typeID: + sql += ( + """ JOIN `db`.`group` AS g ON g.group_id = gn.group_id AND g.type_id = %d -""" % typeID - - if namespaceID: - sql += """ +""" + % typeID + ) + + if namespaceID: + sql += ( + """ WHERE gn.namespace_id = %d -""" % namespaceID - - sql += """ +""" + % namespaceID + ) + + sql += """ GROUP BY gn.namespace_id, gn.name ) JOIN `db`.`namespace` AS n USING (namespace_id) GROUP BY namespace_id """ - - for row in self._db.cursor().execute(sql): - yield row - #generateGroupNameStats() - - - ################################################## - # liftover - # - # originally from UCSC - # reimplemented? in C++ for Biofilter 1.0 by Eric Torstenson - # reimplemented again in Python by John Wallace - - - def hasLiftOverChains(self, oldHG, newHG): - """ - Check if there are liftOver chains between old and new genome assemblies. - - Parameters: - ----------- - oldHG : str - Old genome assembly identifier. - newHG : str - New genome assembly identifier. - - Returns: - -------- - int - Number of liftOver chains found between old and new genome assemblies. - """ - sql = "SELECT COUNT() FROM `db`.`chain` WHERE old_ucschg = ? AND new_ucschg = ?" - return max(row[0] for row in self._db.cursor().execute(sql, (oldHG, newHG))) - #hasLiftOverChains() - - - def _generateApplicableLiftOverChains(self, oldHG, newHG, chrom, start, end): - """ - Generate applicable liftOver chains for a specific region. - - Parameters: - ----------- - oldHG : str - Old genome assembly identifier. - newHG : str - New genome assembly identifier. - chrom : str - Chromosome name. - start : int - Start position of the region. - end : int - End position of the region. - - Yields: - ------- - Tuples containing liftOver chain information for the given region. - (chain_id, old_chr, score, old_start, old_end, new_start, is_fwd, new_chr, old_start, old_end, new_start) - """ - conv = (oldHG,newHG) - if conv in self._liftOverCache: - chains = self._liftOverCache[conv] - else: - chains = {'data':{}, 'keys':{}} - sql = """ + + for row in self._db.cursor().execute(sql): + yield row + + # generateGroupNameStats() + + ################################################## + # liftover + # + # originally from UCSC + # reimplemented? in C++ for Biofilter 1.0 by Eric Torstenson + # reimplemented again in Python by John Wallace + + def hasLiftOverChains(self, oldHG, newHG): + """ + Check if there are liftOver chains between old and new genome assemblies. + + Parameters: + ----------- + oldHG : str + Old genome assembly identifier. + newHG : str + New genome assembly identifier. + + Returns: + -------- + int + Number of liftOver chains found between old and new genome assemblies. + """ + sql = "SELECT COUNT() FROM `db`.`chain` WHERE old_ucschg = ? 
AND new_ucschg = ?" + return max(row[0] for row in self._db.cursor().execute(sql, (oldHG, newHG))) + + # hasLiftOverChains() + + def _generateApplicableLiftOverChains(self, oldHG, newHG, chrom, start, end): + """ + Generate applicable liftOver chains for a specific region. + + Parameters: + ----------- + oldHG : str + Old genome assembly identifier. + newHG : str + New genome assembly identifier. + chrom : str + Chromosome name. + start : int + Start position of the region. + end : int + End position of the region. + + Yields: + ------- + Tuples containing liftOver chain information for the given region. + (chain_id, old_chr, score, old_start, old_end, new_start, is_fwd, new_chr, old_start, old_end, new_start) + """ + conv = (oldHG, newHG) + if conv in self._liftOverCache: + chains = self._liftOverCache[conv] + else: + chains = {"data": {}, "keys": {}} + sql = """ SELECT chain_id, c.old_chr, c.score, c.old_start, c.old_end, c.new_start, c.is_fwd, c.new_chr, cd.old_start, cd.old_end, cd.new_start @@ -2657,209 +2859,235 @@ def _generateApplicableLiftOverChains(self, oldHG, newHG, chrom, start, end): WHERE c.old_ucschg=? AND c.new_ucschg=? ORDER BY c.old_chr, score DESC, cd.old_start """ - for row in self._db.cursor().execute(sql, conv): - chain = (row[2], row[3], row[4], row[5], row[6], row[7], row[0]) - chr = row[1] - - if chr not in chains['data']: - chains['data'][chr] = {chain: []} - chains['keys'][chr] = [chain] - elif chain not in chains['data'][chr]: - chains['data'][chr][chain] = [] - chains['keys'][chr].append(chain) - - chains['data'][chr][chain].append( (row[8],row[9],row[10]) ) - #foreach row - - # Sort the chains by score - for k in chains['keys']: - chains['keys'][k].sort(reverse=True) - - self._liftOverCache[conv] = chains - #if chains are cached - - for c in chains['keys'].get(chrom, []): - # if the region overlaps the chain... (1-based, closed intervals) - if start <= c[2] and end >= c[1]: - data = chains['data'][chrom][c] - idx = bisect.bisect(data, (start, sys.maxsize, sys.maxsize)) - 1 - while (idx < 0) or (data[idx][1] < start): - idx = idx + 1 - while (idx < len(data)) and (data[idx][0] <= end): - yield (c[-1], data[idx][0], data[idx][1], data[idx][2], c[4], c[5]) - idx = idx + 1 - #foreach chain - #_generateApplicableLiftOverChains() - - - def _liftOverRegionUsingChains(self, label, start, end, extra, first_seg, end_seg, total_mapped_sz): - """ - Map a region given the 1st and last segment as well as the total mapped size. - - Parameters: - ----------- - label : str - Label of the region. - start : int - Start position of the region. - end : int - End position of the region. - extra : object - Additional data associated with the region. - first_seg : tuple - First segment information. - end_seg : tuple - Last segment information. - total_mapped_sz : int - Total mapped size of the region. - - Returns: - -------- - tuple or None - Mapped region information if mapped successfully, otherwise None. - """ - mapped_reg = None - - # The front and end differences are the distances from the - # beginning of the segment. 
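# A sketch of checking liftOver support before converting coordinates,
# assuming an open loki_db.Database instance `db`; the UCSC build numbers
# (19 for hg19, 38 for hg38) are examples.
def can_lift(db, old_hg=19, new_hg=38):
    # hasLiftOverChains() returns the number of chain records available for
    # this conversion; 0 means the knowledge file cannot lift between them
    return db.hasLiftOverChains(old_hg, new_hg) > 0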
- - # The front difference should be >= 0 and <= size of 1st segment - front_diff = max(0, min(start - first_seg[1], first_seg[2] - first_seg[1])) - - # The end difference should be similar, but w/ last - end_diff = max(0, min(end - end_seg[1], end_seg[2] - end_seg[1])) - - # Now, if we are moving forward, we add the difference - # to the new_start, backward, we subtract - # Also, at this point, if backward, swap start/end - if first_seg[4]: - new_start = first_seg[3] + front_diff - new_end = end_seg[3] + end_diff - else: - new_start = end_seg[3] - end_diff - new_end = first_seg[3] - front_diff - - # old_startHere, detect if we have mapped a sufficient fraction - # of the region. liftOver uses a default of 95% - mapped_size = total_mapped_sz - front_diff - (end_seg[2] - end_seg[1] + 1) + end_diff + 1 - - if mapped_size / float(end - start + 1) >= 0.95: # TODO: configurable threshold? - mapped_reg = (label, first_seg[5], new_start, new_end, extra) - - return mapped_reg - #_liftOverRegionUsingChains() - - - def generateLiftOverRegions(self, oldHG, newHG, regions, tally=None, errorCallback=None): - """ - Generate liftOver regions based on old and new genome assemblies. - - Parameters: - ----------- - oldHG : str - Old genome assembly identifier. - newHG : str - New genome assembly identifier. - regions : iterable - Iterable of regions to be lifted over, where each region is represented as a tuple - (label, chr, posMin, posMax, extra). - tally : dict or None, optional - A dictionary to store the count of lifted and non-lifted regions (default is None). - errorCallback : function or None, optional - A callback function to handle errors for non-liftable regions (default is None). - - Yields: - ------- - tuple - Mapped regions in the format (label, chrom, new_start, new_end, extra). - """ - # regions=[ (label,chr,posMin,posMax,extra), ... ] - oldHG = int(oldHG) - newHG = int(newHG) - numNull = numLift = 0 - for region in regions: - label,chrom,start,end,extra = region - - if start > end: - start,end = end,start - is_region = (start != end) - - # find and apply chains - mapped_reg = None - curr_chain = None - total_mapped_sz = 0 - first_seg = None - end_seg = None - for seg in self._generateApplicableLiftOverChains(oldHG, newHG, chrom, start, end): - if curr_chain is None: - curr_chain = seg[0] - first_seg = seg - end_seg = seg - total_mapped_sz = seg[2] - seg[1] + 1 - elif seg[0] != curr_chain: - mapped_reg = self._liftOverRegionUsingChains(label, start, end, extra, first_seg, end_seg, total_mapped_sz) - if mapped_reg: - break - curr_chain = seg[0] - first_seg = seg - end_seg = seg - total_mapped_sz = seg[2] - seg[1] + 1 - else: - end_seg = seg - total_mapped_sz = total_mapped_sz + seg[2] - seg[1] + 1 - - if not mapped_reg and first_seg is not None: - mapped_reg = self._liftOverRegionUsingChains(label, start, end, extra, first_seg, end_seg, total_mapped_sz) - - if mapped_reg: - numLift += 1 - if not is_region: - mapped_reg = (mapped_reg[0], mapped_reg[1], mapped_reg[2], mapped_reg[2], extra) - yield mapped_reg - else: - numNull += 1 - if errorCallback: - errorCallback(region) - #foreach region - - if tally != None: - tally['null'] = numNull - tally['lift'] = numLift - #generateLiftOverRegions() - - - def generateLiftOverLoci(self, oldHG, newHG, loci, tally=None, errorCallback=None): - """ - Generate liftOver loci based on old and new genome assemblies. - - Parameters: - ----------- - oldHG : str - Old genome assembly identifier. - newHG : str - New genome assembly identifier. 
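# A worked example of the mapping arithmetic above, for a single
# forward-strand segment (all values illustrative): a chain segment covering
# old positions 1000-2000 that starts at new position 5000, and a query
# region 1200-1300.
#   front_diff  = min(1200 - 1000, 2000 - 1000) = 200
#   end_diff    = min(1300 - 1000, 2000 - 1000) = 300
#   new_start   = 5000 + 200 = 5200
#   new_end     = 5000 + 300 = 5300
#   mapped_size = 1001 - 200 - 1001 + 300 + 1 = 101   (the whole 101 bp region)
# 101 / 101 >= 0.95, so the region lifts to (label, new_chr, 5200, 5300, extra);
# on a reverse-strand chain the same differences are instead subtracted from
# the segments' new_start values and the start/end roles swap.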
- loci : iterable - Iterable of loci to be lifted over, where each locus is represented as a tuple - (label, chr, pos, extra). - tally : dict or None, optional - A dictionary to store the count of lifted and non-lifted loci (default is None). - errorCallback : function or None, optional - A callback function to handle errors for non-liftable loci (default is None). - - Returns: - -------- - iterable - Yields new loci in the format (label, chrom, new_pos, extra) for each successfully - lifted locus. - """ - # loci=[ (label,chr,pos,extra), ... ] - regions = ((l[0],l[1],l[2],l[2],l[3]) for l in loci) - newloci = ((r[0],r[1],r[2],r[4]) for r in self.generateLiftOverRegions(oldHG, newHG, regions, tally, errorCallback)) - return newloci - #generateLiftOverLoci() - - -#Database + for row in self._db.cursor().execute(sql, conv): + chain = (row[2], row[3], row[4], row[5], row[6], row[7], row[0]) + chr = row[1] + + if chr not in chains["data"]: + chains["data"][chr] = {chain: []} + chains["keys"][chr] = [chain] + elif chain not in chains["data"][chr]: + chains["data"][chr][chain] = [] + chains["keys"][chr].append(chain) + + chains["data"][chr][chain].append((row[8], row[9], row[10])) + # foreach row + + # Sort the chains by score + for k in chains["keys"]: + chains["keys"][k].sort(reverse=True) + + self._liftOverCache[conv] = chains + # if chains are cached + + for c in chains["keys"].get(chrom, []): + # if the region overlaps the chain... (1-based, closed intervals) + if start <= c[2] and end >= c[1]: + data = chains["data"][chrom][c] + idx = bisect.bisect(data, (start, sys.maxsize, sys.maxsize)) - 1 + while (idx < 0) or (data[idx][1] < start): + idx = idx + 1 + while (idx < len(data)) and (data[idx][0] <= end): + yield (c[-1], data[idx][0], data[idx][1], data[idx][2], c[4], c[5]) + idx = idx + 1 + # foreach chain + + # _generateApplicableLiftOverChains() + + def _liftOverRegionUsingChains( + self, label, start, end, extra, first_seg, end_seg, total_mapped_sz + ): + """ + Map a region given the 1st and last segment as well as the total mapped size. + + Parameters: + ----------- + label : str + Label of the region. + start : int + Start position of the region. + end : int + End position of the region. + extra : object + Additional data associated with the region. + first_seg : tuple + First segment information. + end_seg : tuple + Last segment information. + total_mapped_sz : int + Total mapped size of the region. + + Returns: + -------- + tuple or None + Mapped region information if mapped successfully, otherwise None. + """ + mapped_reg = None + + # The front and end differences are the distances from the + # beginning of the segment. + + # The front difference should be >= 0 and <= size of 1st segment + front_diff = max(0, min(start - first_seg[1], first_seg[2] - first_seg[1])) + + # The end difference should be similar, but w/ last + end_diff = max(0, min(end - end_seg[1], end_seg[2] - end_seg[1])) + + # Now, if we are moving forward, we add the difference + # to the new_start, backward, we subtract + # Also, at this point, if backward, swap start/end + if first_seg[4]: + new_start = first_seg[3] + front_diff + new_end = end_seg[3] + end_diff + else: + new_start = end_seg[3] - end_diff + new_end = first_seg[3] - front_diff + + # old_startHere, detect if we have mapped a sufficient fraction + # of the region. 
liftOver uses a default of 95% + mapped_size = ( + total_mapped_sz - front_diff - (end_seg[2] - end_seg[1] + 1) + end_diff + 1 + ) + + if ( + mapped_size / float(end - start + 1) >= 0.95 + ): # TODO: configurable threshold? + mapped_reg = (label, first_seg[5], new_start, new_end, extra) + + return mapped_reg + + # _liftOverRegionUsingChains() + + def generateLiftOverRegions( + self, oldHG, newHG, regions, tally=None, errorCallback=None + ): + """ + Generate liftOver regions based on old and new genome assemblies. + + Parameters: + ----------- + oldHG : str + Old genome assembly identifier. + newHG : str + New genome assembly identifier. + regions : iterable + Iterable of regions to be lifted over, where each region is represented as a tuple + (label, chr, posMin, posMax, extra). + tally : dict or None, optional + A dictionary to store the count of lifted and non-lifted regions (default is None). + errorCallback : function or None, optional + A callback function to handle errors for non-liftable regions (default is None). + + Yields: + ------- + tuple + Mapped regions in the format (label, chrom, new_start, new_end, extra). + """ + # regions=[ (label,chr,posMin,posMax,extra), ... ] + oldHG = int(oldHG) + newHG = int(newHG) + numNull = numLift = 0 + for region in regions: + label, chrom, start, end, extra = region + + if start > end: + start, end = end, start + is_region = start != end + + # find and apply chains + mapped_reg = None + curr_chain = None + total_mapped_sz = 0 + first_seg = None + end_seg = None + for seg in self._generateApplicableLiftOverChains( + oldHG, newHG, chrom, start, end + ): + if curr_chain is None: + curr_chain = seg[0] + first_seg = seg + end_seg = seg + total_mapped_sz = seg[2] - seg[1] + 1 + elif seg[0] != curr_chain: + mapped_reg = self._liftOverRegionUsingChains( + label, start, end, extra, first_seg, end_seg, total_mapped_sz + ) + if mapped_reg: + break + curr_chain = seg[0] + first_seg = seg + end_seg = seg + total_mapped_sz = seg[2] - seg[1] + 1 + else: + end_seg = seg + total_mapped_sz = total_mapped_sz + seg[2] - seg[1] + 1 + + if not mapped_reg and first_seg is not None: + mapped_reg = self._liftOverRegionUsingChains( + label, start, end, extra, first_seg, end_seg, total_mapped_sz + ) + + if mapped_reg: + numLift += 1 + if not is_region: + mapped_reg = ( + mapped_reg[0], + mapped_reg[1], + mapped_reg[2], + mapped_reg[2], + extra, + ) + yield mapped_reg + else: + numNull += 1 + if errorCallback: + errorCallback(region) + # foreach region + + if tally != None: + tally["null"] = numNull + tally["lift"] = numLift + + # generateLiftOverRegions() + + def generateLiftOverLoci(self, oldHG, newHG, loci, tally=None, errorCallback=None): + """ + Generate liftOver loci based on old and new genome assemblies. + + Parameters: + ----------- + oldHG : str + Old genome assembly identifier. + newHG : str + New genome assembly identifier. + loci : iterable + Iterable of loci to be lifted over, where each locus is represented as a tuple + (label, chr, pos, extra). + tally : dict or None, optional + A dictionary to store the count of lifted and non-lifted loci (default is None). + errorCallback : function or None, optional + A callback function to handle errors for non-liftable loci (default is None). + + Returns: + -------- + iterable + Yields new loci in the format (label, chrom, new_pos, extra) for each successfully + lifted locus. + """ + # loci=[ (label,chr,pos,extra), ... 
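# A usage sketch for the liftOver generators above, assuming an open
# loki_db.Database instance `db`; labels, chromosome codes and positions in
# `loci` are placeholders supplied by the caller.
def lift_snps(db, loci, old_hg=19, new_hg=38):
    tally = {}
    dropped = []
    lifted = list(
        db.generateLiftOverLoci(
            old_hg, new_hg, loci,          # loci=[ (label,chr,pos,extra), ... ]
            tally=tally,
            errorCallback=dropped.append,  # called once per entry that cannot be lifted
        )
    )
    # tally['lift'] and tally['null'] count converted vs. dropped entries
    return lifted, dropped, tally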
] + regions = ((l[0], l[1], l[2], l[2], l[3]) for l in loci) + newloci = ( + (r[0], r[1], r[2], r[4]) + for r in self.generateLiftOverRegions( + oldHG, newHG, regions, tally, errorCallback + ) + ) + return newloci + + # generateLiftOverLoci() + + +# Database # TODO: find a better place for this liftover testing code diff --git a/loki/loki_source.py b/loki/loki_source.py index fd3b817..66fb942 100644 --- a/loki/loki_source.py +++ b/loki/loki_source.py @@ -15,939 +15,1113 @@ class Source(object): - - - ################################################## - # constructor - - - def __init__(self, lokidb): - assert(isinstance(lokidb, loki_db.Database)) - assert(self.__class__.__name__.startswith('Source_')) - self._loki = lokidb - self._db = lokidb._db - self._sourceID = self.addSource(self.getSourceName()) - assert(self._sourceID > 0) - #__init__() - - - ################################################## - # source interface - - - @classmethod - def getVersionString(cls): - # when checked out from SVN, these $-delimited strings are magically kept updated - rev = '$Revision$'.split() - date = '$Date$'.split() - stat = None - - if len(rev) > 2: - version = 'r%s' % rev[1:2] - else: - stat = stat or os.stat(sys.modules[cls.__module__].__file__) # type: ignore - version = '%s' % (stat.st_size,) - - if len(date) > 3: - version += ' (%s %s)' % date[1:3] - else: - stat = stat or os.stat(sys.modules[cls.__module__].__file__) # type: ignore - version += datetime.datetime.utcfromtimestamp(stat.st_mtime).strftime(' (%Y-%m-%d)' if (len(rev) > 2) else ' (%Y-%m-%d %H:%M:%S)') - - return version - #getVersionString() - - - @classmethod - def getOptions(cls): - return None - #getOptions() - - - def validateOptions(self, options): - for o in options: - return "unexpected option '%s'" % o - return True - #validateOptions() - - - def download(self, options): - raise Exception("invalid LOKI Source plugin: download() not implemented") - #download() - - - def update(self, options): - raise Exception("invalid LOKI Source plugin: update() not implemented") - #update() - - - ################################################## - # context manager - - - def __enter__(self): - return self._loki.__enter__() - #__enter__() - - - def __exit__(self, excType, excVal, traceback): - return self._loki.__exit__(excType, excVal, traceback) - #__exit__() - - - ################################################## - # logging - - - def log(self, message=""): - return self._loki.log(message) - #log() - - - def logPush(self, message=None): - return self._loki.logPush(message) - #logPush() - - - def logPop(self, message=None): - return self._loki.logPop(message) - #logPop() - - - ################################################## - # database update - - - def prepareTableForUpdate(self, table): - return self._loki.prepareTableForUpdate(table) - #prepareTableUpdate() - - - def prepareTableForQuery(self, table): - return self._loki.prepareTableForQuery(table) - #prepareTableQuery() - - - ################################################## - # metadata management - - - def addLDProfile(self, ldprofile, description=None, metric=None, value=None): - return self.addLDProfiles([(ldprofile,description,metric,value)])[ldprofile] - #addLDProfile() - - - def addLDProfiles(self, ldprofiles): - # ldprofiles=[ (ldprofile,description,metric,value), ... 
] - dbc = self._db.cursor() - ret = {} - # use ABORT to avoid wasting autoincrements on existing rows, - # and execute() to avoid bailing out of executemany() due to ABORT - for ld in ldprofiles: - try: - dbc.execute("INSERT OR ABORT INTO `db`.`ldprofile` (ldprofile,description,metric,value) VALUES (LOWER(?),?,LOWER(?),?); SELECT LAST_INSERT_ROWID()", ld) - except apsw.ConstraintError: - dbc.execute("SELECT ldprofile_id FROM `db`.`ldprofile` WHERE ldprofile = LOWER(?)", ld[0:1]) - for row in dbc: - ret[ld[0]] = row[0] - return ret - #addLDProfiles() - - - def addNamespace(self, namespace, polygenic=0): - return self.addNamespaces([(namespace,polygenic)])[namespace] - #addNamespace() - - - def addNamespaces(self, namespaces): - # namespaces=[ (namespace,polygenic), ... ] - dbc = self._db.cursor() - ret = {} - # use ABORT to avoid wasting autoincrements on existing rows, - # and execute() to avoid bailing out of executemany() due to ABORT - for n in namespaces: - try: - dbc.execute("INSERT OR ABORT INTO `db`.`namespace` (namespace,polygenic) VALUES (LOWER(?),?); SELECT LAST_INSERT_ROWID()", n) - except apsw.ConstraintError: - dbc.execute("SELECT namespace_id FROM `db`.`namespace` WHERE namespace = LOWER(?)", n[0:1]) - for row in dbc: - ret[n[0]] = row[0] - return ret - #addNamespaces() - - - def addRelationship(self, relationship): - return self.addRelationships([(relationship,)])[relationship] - #addRelationship() - - - def addRelationships(self, relationships): - # relationships=[ (relationship,), ... ] - dbc = self._db.cursor() - ret = {} - # use ABORT to avoid wasting autoincrements on existing rows, - # and execute() to avoid bailing out of executemany() due to ABORT - for r in relationships: - try: - dbc.execute("INSERT OR ABORT INTO `db`.`relationship` (relationship) VALUES (LOWER(?)); SELECT LAST_INSERT_ROWID()", r) - except apsw.ConstraintError: - dbc.execute("SELECT relationship_id FROM `db`.`relationship` WHERE relationship = LOWER(?)", r[0:1]) - for row in dbc: - ret[r[0]] = row[0] - return ret - #addRelationships() - - - def addRole(self, role, description=None, coding=None, exon=None): - return self.addRoles([(role,description,coding,exon)])[role] - #addRole() - - - def addRoles(self, roles): - # roles=[ (role,description,coding,exon), ... ] - dbc = self._db.cursor() - ret = {} - # use ABORT to avoid wasting autoincrements on existing rows, - # and execute() to avoid bailing out of executemany() due to ABORT - for r in roles: - try: - dbc.execute("INSERT OR ABORT INTO `db`.`role` (role,description,coding,exon) VALUES (LOWER(?),?,?,?); SELECT LAST_INSERT_ROWID()", r) - except apsw.ConstraintError: - dbc.execute("SELECT role_id FROM `db`.`role` WHERE role = LOWER(?)", r[0:1]) - for row in dbc: - ret[r[0]] = row[0] - return ret - #addRoles() - - - def addSource(self, source): - return self.addSources([(source,)])[source] - #addSource() - - - def addSources(self, sources): - # sources=[ (source,), ... 
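# A sketch of the metadata registration pattern shared by the add*() helpers
# above, as it might appear inside a Source subclass's update(); the namespace,
# type and relationship names are hypothetical examples.
def _register_metadata(self):
    # each helper INSERTs any new rows (re-reading existing ones when the
    # insert hits a constraint) and returns a {name: id} mapping for later use
    namespace_ids = self.addNamespaces([("symbol", 0), ("entrez_gid", 0)])
    type_ids = self.addTypes([("gene",), ("pathway",)])
    relationship_ids = self.addRelationships([("is_a",)])
    return namespace_ids, type_ids, relationship_ids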
] - dbc = self._db.cursor() - ret = {} - # use ABORT to avoid wasting autoincrements on existing rows, - # and execute() to avoid bailing out of executemany() due to ABORT - for s in sources: - try: - dbc.execute("INSERT OR ABORT INTO `db`.`source` (source) VALUES (LOWER(?)); SELECT LAST_INSERT_ROWID()", s) - except apsw.ConstraintError: - dbc.execute("SELECT source_id FROM `db`.`source` WHERE source = LOWER(?)", s[0:1]) - for row in dbc: - ret[s[0]] = row[0] - return ret - #addSources() - - - def addType(self, type): - return self.addTypes([(type,)])[type] - #addType() - - - def addTypes(self, types): - # types=[ (type,), ... ] - dbc = self._db.cursor() - ret = {} - # use ABORT to avoid wasting autoincrements on existing rows, - # and execute() to avoid bailing out of executemany() due to ABORT - for t in types: - try: - dbc.execute("INSERT OR ABORT INTO `db`.`type` (type) VALUES (LOWER(?)); SELECT LAST_INSERT_ROWID()", t) - except apsw.ConstraintError: - dbc.execute("SELECT type_id FROM `db`.`type` WHERE type = LOWER(?)", t[0:1]) - for row in dbc: - ret[t[0]] = row[0] - return ret - #addTypes() - - def addSubtypes(self, subtypes): - # types=[ (type,), ... ] - dbc = self._db.cursor() - ret = {} - # use ABORT to avoid wasting autoincrements on existing rows, - # and execute() to avoid bailing out of executemany() due to ABORT - for t in subtypes: - try: - dbc.execute("INSERT OR ABORT INTO `db`.`subtype` (subtype) VALUES (LOWER(?)); SELECT LAST_INSERT_ROWID()", t) - except apsw.ConstraintError: - dbc.execute("SELECT subtype_id FROM `db`.`subtype` WHERE subtype = LOWER(?)", t[0:1]) - for row in dbc: - ret[t[0]] = row[0] - return ret - #addTypes() - - - def deleteAll(self): - dbc = self._db.cursor() - tables = [ - 'snp_merge', 'snp_locus', 'snp_entrez_role', - 'biopolymer', 'biopolymer_name', 'biopolymer_name_name', 'biopolymer_region', - 'group', 'group_name', 'group_group', 'group_biopolymer', 'group_member_name', - 'chain', 'chain_data', - 'gwas', - ] - for table in tables: - dbc.execute("DELETE FROM `db`.`%s` WHERE source_id = %d" % (table,self.getSourceID())) - #deleteAll() - - - ################################################## - # source metadata management - - - def getSourceName(self): - return self.__class__.__name__[7:] - #getSourceName() - - - def getSourceID(self): - return self._sourceID - #getSourceID() - - - def setSourceBuilds(self, grch=None, ucschg=None): - sql = "UPDATE `db`.`source` SET grch = ?, ucschg = ?, current_ucschg = ? WHERE source_id = ?" - self._db.cursor().execute(sql, (grch, ucschg, ucschg, self.getSourceID())) - #setSourceBuilds() - - - ################################################## - # snp data management - - - def addSNPMerges(self, snpMerges): - # snpMerges=[ (rsMerged,rsCurrent), ... ] - self.prepareTableForUpdate('snp_merge') - sql = "INSERT OR IGNORE INTO `db`.`snp_merge` (rsMerged,rsCurrent,source_id) VALUES (?,?,%d)" % (self.getSourceID(),) - with self._db: - self._db.cursor().executemany(sql, snpMerges) - #addSNPMerges() - - - def addSNPLoci(self, snpLoci): - # snpLoci=[ (rs,chr,pos,validated), ... ] - self.prepareTableForUpdate('snp_locus') - sql = "INSERT OR IGNORE INTO `db`.`snp_locus` (rs,chr,pos,validated,source_id) VALUES (?,?,?,?,%d)" % (self.getSourceID(),) - with self._db: # type: ignore - self._db.cursor().executemany(sql, snpLoci) - #addSNPLoci() - - - def addChromosomeSNPLoci(self, chromosome, snpLoci): - # snpLoci=[ (rs,pos,validated), ... 
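# A sketch of the bulk SNP helpers above, as a loader's update() might call
# them; the rs numbers, chromosome codes, positions and validated flags are
# placeholder data.
def _store_example_snps(self):
    # addSNPLoci() takes (rs, chr, pos, validated) tuples ...
    self.addSNPLoci([(123, 1, 1000000, 1), (456, 1, 2000000, 0)])
    # ... the per-chromosome variant fixes chr and takes (rs, pos, validated)
    self.addChromosomeSNPLoci(2, [(789, 500000, 1)])
    # merged rs numbers are recorded as (rsMerged, rsCurrent) pairs
    self.addSNPMerges([(111, 123)])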
] - # self.prepareTableForUpdate('snp_locus') - sql = "INSERT OR IGNORE INTO `db`.`snp_locus` (rs,chr,pos,validated,source_id) VALUES (?,%d,?,?,%d)" % (chromosome,self.getSourceID(),) - # with self._db: - self._db.cursor().executemany(sql, snpLoci) - #addChromosomeSNPLoci() - - - def addSNPEntrezRoles(self, snpRoles): - # snpRoles=[ (rs,entrez_id,role_id), ... ] - self.prepareTableForUpdate('snp_entrez_role') - sql = "INSERT OR IGNORE INTO `db`.`snp_entrez_role` (rs,entrez_id,role_id,source_id) VALUES (?,?,?,%d)" % (self.getSourceID(),) - with self._db: # type: ignore - self._db.cursor().executemany(sql, snpRoles) - #addSNPEntrezRoles() - - - ################################################## - # biopolymer data management - - - def addBiopolymers(self, biopolymers): - # biopolymers=[ (type_id,label,description), ... ] - self.prepareTableForUpdate('biopolymer') - sql = "INSERT INTO `db`.`biopolymer` (type_id,label,description,source_id) VALUES (?,?,?,%d); SELECT last_insert_rowid()" % (self.getSourceID(),) - return [ row[0] for row in self._db.cursor().executemany(sql, biopolymers) ] - #addBiopolymers() - - - def addTypedBiopolymers(self, typeID, biopolymers): - # biopolymers=[ (label,description), ... ] - #self.prepareTableForUpdate('biopolymer') - sql = "INSERT INTO `db`.`biopolymer` (type_id,label,description,source_id) VALUES (%d,?,?,%d); SELECT last_insert_rowid()" % (typeID,self.getSourceID(),) - return [ row[0] for row in self._db.cursor().executemany(sql, biopolymers) ] - #addTypedBiopolymers() - - - def addBiopolymerNames(self, biopolymerNames): - # biopolymerNames=[ (biopolymer_id,namespace_id,name), ... ] - self.prepareTableForUpdate('biopolymer_name') - sql = "INSERT OR IGNORE INTO `db`.`biopolymer_name` (biopolymer_id,namespace_id,name,source_id) VALUES (?,?,?,%d)" % (self.getSourceID(),) - self._db.cursor().executemany(sql, biopolymerNames) - #addBiopolymerNames() - - - def addBiopolymerNamespacedNames(self, namespaceID, biopolymerNames): - # biopolymerNames=[ (biopolymer_id,name), ... ] - #self.prepareTableForUpdate('biopolymer_name') - sql = "INSERT OR IGNORE INTO `db`.`biopolymer_name` (biopolymer_id,namespace_id,name,source_id) VALUES (?,%d,?,%d)" % (namespaceID,self.getSourceID(),) - self._db.cursor().executemany(sql, biopolymerNames) - #addBiopolymerNamespacedNames() - - - def addBiopolymerNameNames(self, biopolymerNameNames): - # biopolymerNameNames=[ (old_namespace_id,old_name,old_type_id,new_namespace_id,new_name), ... ] - self.prepareTableForUpdate('biopolymer_name_name') - sql = "INSERT OR IGNORE INTO `db`.`biopolymer_name_name` (namespace_id,name,type_id,new_namespace_id,new_name,source_id) VALUES (?,?,?,?,?,%d)" % (self.getSourceID(),) - self._db.cursor().executemany(sql, biopolymerNameNames) - #addBiopolymerNameNames() - - - def addBiopolymerTypedNameNamespacedNames(self, oldTypeID, newNamespaceID, biopolymerNameNames): - # biopolymerNameNames=[ (old_namespace_id,old_name,new_name), ... ] - self.prepareTableForUpdate('biopolymer_name_name') - sql = "INSERT OR IGNORE INTO `db`.`biopolymer_name_name` (namespace_id,name,type_id,new_namespace_id,new_name,source_id) VALUES (?,?,%d,%d,?,%d)" % (oldTypeID,newNamespaceID,self.getSourceID(),) - self._db.cursor().executemany(sql, biopolymerNameNames) - #addBiopolymerTypedNameNamespacedNames() - - - def addBiopolymerRegions(self, biopolymerRegions): - # biopolymerRegions=[ (biopolymer_id,ldprofile_id,chr,posMin,posMax), ... 
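# A sketch of the usual biopolymer loading sequence built from the helpers
# above, inside a loader's update(); the type, namespace and LD-profile IDs
# are assumed to come from the metadata helpers and the gene data is a
# placeholder.
def _store_example_genes(self, gene_type_id, symbol_ns_id, ld_none_id):
    # insert the records and capture the generated biopolymer_ids ...
    bp_ids = self.addTypedBiopolymers(gene_type_id, [("GENE1", "example gene")])
    # ... then attach names and regions keyed by those ids
    self.addBiopolymerNamespacedNames(symbol_ns_id, [(bp_ids[0], "GENE1")])
    self.addBiopolymerRegions([(bp_ids[0], ld_none_id, 1, 1000, 2000)])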
] - self.prepareTableForUpdate('biopolymer_region') - sql = "INSERT OR IGNORE INTO `db`.`biopolymer_region` (biopolymer_id,ldprofile_id,chr,posMin,posMax,source_id) VALUES (?,?,?,?,?,%d)" % (self.getSourceID(),) - self._db.cursor().executemany(sql, biopolymerRegions) - #addBiopolymerRegions() - - - def addBiopolymerLDProfileRegions(self, ldprofileID, biopolymerRegions): - # biopolymerRegions=[ (biopolymer_id,chr,posMin,posMax), ... ] - #self.prepareTableForUpdate('biopolymer_region') - sql = "INSERT OR IGNORE INTO `db`.`biopolymer_region` (biopolymer_id,ldprofile_id,chr,posMin,posMax,source_id) VALUES (?,%d,?,?,?,%d)" % (ldprofileID,self.getSourceID(),) - self._db.cursor().executemany(sql, biopolymerRegions) - #addBiopolymerLDProfileRegions() - - - ################################################## - # group data management - - - def addGroups(self, groups): - # groups=[ (type_id,subtype_id,label,description), ... ] - self.prepareTableForUpdate('group') - sql = "INSERT INTO `db`.`group` (type_id,subtype_id,label,description,source_id) VALUES (?,?,?,?,%d); SELECT last_insert_rowid()" % (self.getSourceID(),) - return [ row[0] for row in self._db.cursor().executemany(sql, groups) ] - #addGroups() - - - def addTypedGroups(self, typeID, groups): - # groups=[ (subtype,label,description), ... ] - #self.prepareTableForUpdate('group') - sql = "INSERT INTO `db`.`group` (type_id,subtype_id,label,description,source_id) VALUES (%d,?,?,?,%d); SELECT last_insert_rowid()" % (typeID,self.getSourceID(),) - return [ row[0] for row in self._db.cursor().executemany(sql, groups) ] - #addTypedGroups() - - - def addGroupNames(self, groupNames): - # groupNames=[ (group_id,namespace_id,name), ... ] - self.prepareTableForUpdate('group_name') - sql = "INSERT OR IGNORE INTO `db`.`group_name` (group_id,namespace_id,name,source_id) VALUES (?,?,?,%d)" % (self.getSourceID(),) - self._db.cursor().executemany(sql, groupNames) - #addGroupNames() - - - def addGroupNamespacedNames(self, namespaceID, groupNames): - # groupNames=[ (group_id,name), ... ] - #self.prepareTableForUpdate('group_name') - sql = "INSERT OR IGNORE INTO `db`.`group_name` (group_id,namespace_id,name,source_id) VALUES (?,%d,?,%d)" % (namespaceID,self.getSourceID(),) - self._db.cursor().executemany(sql, groupNames) - #addGroupNamespacedNames() - - - def addGroupRelationships(self, groupRels): - # groupRels=[ (group_id,related_group_id,relationship_id,contains), ... ] - #self.prepareTableForUpdate('group_group') - # we SHOULD be able to do (?1,?2,?3) and (?2,?1,?3) with the same 3 bindings for each execution, - # but apsw or SQLite appears to treat the compound statement separately, so we have to copy the bindings - sql = "INSERT OR IGNORE INTO `db`.`group_group` (group_id,related_group_id,relationship_id,direction,contains,source_id)" - sql += " VALUES (?1,?2,?3,1,(CASE WHEN ?4 IS NULL THEN NULL WHEN ?4 > 0 THEN 1 WHEN ?4 < 0 THEN -1 ELSE 0 END),%d)" % (self.getSourceID(),) - sql += ";INSERT OR IGNORE INTO `db`.`group_group` (group_id,related_group_id,relationship_id,direction,contains,source_id)" - sql += " VALUES (?2,?1,?3,-1,(CASE WHEN ?4 IS NULL THEN NULL WHEN ?4 > 0 THEN -1 WHEN ?4 < 0 THEN 1 ELSE 0 END),%d)" % (self.getSourceID(),) - self._db.cursor().executemany(sql, (2*gr for gr in groupRels)) # type: ignore - #addGroupRelationships() - - - def addGroupParentRelationships(self, groupRels): - # groupRels=[ (group_id,related_group_id,relationship_id), ... 
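# A sketch of group loading with the helpers above, inside a loader's
# update(); the type, subtype, namespace and relationship IDs are assumed to
# come from the metadata helpers, and the pathway data is a placeholder.
def _store_example_groups(self, pathway_type_id, subtype_id, pathway_ns_id, rel_id):
    parent, child = self.addTypedGroups(
        pathway_type_id,
        [(subtype_id, "parent pathway", None), (subtype_id, "child pathway", None)],
    )
    self.addGroupNamespacedNames(pathway_ns_id, [(parent, "PW:0001"), (child, "PW:0002")])
    # each relationship is written twice, once per direction, with the
    # 'contains' flag mirrored; the parent/child/sibling variants differ only
    # in the contains value they store
    self.addGroupParentRelationships([(parent, child, rel_id)])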
] - self.prepareTableForUpdate('group_group') - sql = "INSERT OR IGNORE INTO `db`.`group_group` (group_id,related_group_id,relationship_id,direction,contains,source_id)" - sql += " VALUES (?1,?2,?3,1,1,%d)" % (self.getSourceID(),) - sql += ";INSERT OR IGNORE INTO `db`.`group_group` (group_id,related_group_id,relationship_id,direction,contains,source_id)" - sql += " VALUES (?2,?1,?3,-1,-1,%d)" % (self.getSourceID(),) - self._db.cursor().executemany(sql, (2*gr for gr in groupRels)) # type: ignore - #addGroupParentRelationships() - - - def addGroupChildRelationships(self, groupRels): - # groupRels=[ (group_id,related_group_id,relationship_id), ... ] - self.prepareTableForUpdate('group_group') - sql = "INSERT OR IGNORE INTO `db`.`group_group` (group_id,related_group_id,relationship_id,direction,contains,source_id)" - sql += " VALUES (?1,?2,?3,1,-1,%d)" % (self.getSourceID(),) - sql += ";INSERT OR IGNORE INTO `db`.`group_group` (group_id,related_group_id,relationship_id,direction,contains,source_id)" - sql += " VALUES (?2,?1,?3,-1,1,%d)" % (self.getSourceID(),) - self._db.cursor().executemany(sql, (2*gr for gr in groupRels)) # type: ignore - #addGroupChildRelationships() - - - def addGroupSiblingRelationships(self, groupRels): - # groupRels=[ (group_id,related_group_id,relationship_id), ... ] - self.prepareTableForUpdate('group_group') - sql = "INSERT OR IGNORE INTO `db`.`group_group` (group_id,related_group_id,relationship_id,direction,contains,source_id)" - sql += " VALUES (?1,?2,?3,1,0,%d)" % (self.getSourceID(),) - sql += ";INSERT OR IGNORE INTO `db`.`group_group` (group_id,related_group_id,relationship_id,direction,contains,source_id)" - sql += " VALUES (?2,?1,?3,-1,0,%d)" % (self.getSourceID(),) - self._db.cursor().executemany(sql, (2*gr for gr in groupRels)) # type: ignore - #addGroupSiblingRelationships() - - - def addGroupBiopolymers(self, groupBiopolymers): - # groupBiopolymers=[ (group_id,biopolymer_id), ... ] - #self.prepareTableForUpdate('group_biopolymer') - sql = "INSERT OR IGNORE INTO `db`.`group_biopolymer` (group_id,biopolymer_id,specificity,implication,quality,source_id) VALUES (?,?,100,100,100,%d)" % (self.getSourceID(),) - self._db.cursor().executemany(sql, groupBiopolymers) - #addGroupBiopolymers() - - - def addGroupMemberNames(self, groupMemberNames): - # groupMemberNames=[ (group_id,member,type_id,namespace_id,name), ... ] - self.prepareTableForUpdate('group_member_name') - sql = "INSERT OR IGNORE INTO `db`.`group_member_name` (group_id,member,type_id,namespace_id,name,source_id) VALUES (?,?,?,?,?,%d)" % (self.getSourceID(),) - self._db.cursor().executemany(sql, groupMemberNames) - #addGroupMemberNames() - - - def addGroupMemberTypedNamespacedNames(self, typeID, namespaceID, groupMemberNames): - # groupMemberNames=[ (group_id,member,name), ... ] - self.prepareTableForUpdate('group_member_name') - sql = "INSERT OR IGNORE INTO `db`.`group_member_name` (group_id,member,type_id,namespace_id,name,source_id) VALUES (?,?,%d,%d,?,%d)" % (typeID,namespaceID,self.getSourceID(),) - self._db.cursor().executemany(sql, groupMemberNames) - #addGroupMemberTypedNamespacedNames() - - - ################################################## - # liftover data management - - - def addChains(self, old_ucschg, new_ucschg, chain_list): - # chain_list=[ (score,old_chr,old_start,old_end,new_chr,new_start,new_end,is_forward), ... ] - """ - Adds all of the chains described in chain_list and returns the - ids of the added chains. 
The chain_list must be an iterable - container of objects that can be inserted into the chain table - """ - self.prepareTableForUpdate('chain') - sql = "INSERT INTO `db`.`chain` (score,old_ucschg,old_chr,old_start,old_end,new_ucschg,new_chr,new_start,new_end,is_fwd,source_id)" - sql += " VALUES (?,%d,?,?,?,%d,?,?,?,?,%d); SELECT last_insert_rowid()" % (old_ucschg,new_ucschg,self.getSourceID()) - return [ row[0] for row in self._db.cursor().executemany(sql, chain_list) ] - #addChains() - - - def addChainData(self, chain_data_list): - """ - Adds all of the chain data into the chain data table - """ - self.prepareTableForUpdate('chain_data') - sql = "INSERT INTO `db`.`chain_data` (chain_id,old_start,old_end,new_start,source_id) VALUES (?,?,?,?,%d)" % (self.getSourceID(),) - self._db.cursor().executemany(sql, chain_data_list) - #addChainData() - - - ################################################## - # gwas data management - - - def addGWASAnnotations(self, gwasAnnotations): - # gwasAnnotations=[ (rs,chm,pos,trait,snps,orBeta,allele95ci,riskAfreq,pubmedID), ... ] - self.prepareTableForUpdate('gwas') - sql = "INSERT OR IGNORE INTO `db`.`gwas` (rs,chr,pos,trait,snps,orbeta,allele95ci,riskAfreq,pubmed_id,source_id) VALUES (?,?,?,?,?,?,?,?,?,%d)" % (self.getSourceID(),) - self._db.cursor().executemany(sql, gwasAnnotations) - #addGWASAnnotations() - - - ################################################## - # source utility methods - - - def zfile(self, fileName, splitChar="\n", chunkSize=1*1024*1024): - dc = zlib.decompressobj(zlib.MAX_WBITS | 32) # autodetect gzip or zlib header - with open(fileName,'rb') as filePtr: - text = "" - while dc: - data = filePtr.read(chunkSize) - if data: - decompressedData = dc.decompress(data) - text += decompressedData.decode('utf-8') - data = None - else: - text += dc.flush().decode('utf-8') - dc = None - if text: - lines = text.split(splitChar) - i,x = 0,len(lines)-1 - text = lines[x] - while i < x: - yield lines[i] - i += 1 - lines = None - #while data remains - if text: - yield text - #with fileName - #zfile() - - - def findConnectedComponents(self, neighbors): - f = set() - c = list() - for v in neighbors: - if v not in f: - f.add(v) - c.append(self._findConnectedComponents_recurse(neighbors, v, f, {v})) - return c - #findConnectedComponents() - - - def _findConnectedComponents_recurse(self, n, v, f, c): - for u in n[v]: - if u not in f: - f.add(u) - c.add(u) - self._findConnectedComponents_recurse(n, v, f, c) - return c - #_findConnectedComponents_recurse() - - - def findEdgeDisjointCliques(self, neighbors): - # neighbors = {'a':{'b','c'}, 'b':{'a'}, 'c':{'a'}, ...} - # 'a' not in neighbors['a'] - # 'b' in neighbors['a'] => 'a' in neighbors['b'] - - # clone neighbors so we can modify the local copy - n = { v:set(neighbors[v]) for v in neighbors } - c = list() - - while True: - # prune isolated vertices and extract hanging pairs - for v in n.keys(): - try: - if len(n[v]) == 0: - del n[v] - elif len(n[v]) == 1: - u, = n[v] - n[v].add(v) - c.append(n[v]) - del n[v] - n[u].remove(v) - if len(n[u]) == 0: - del n[u] - except KeyError: - pass - #foreach vertex - - # if nothing remains, we're done - if len(n) == 0: - return c - - # find maximal cliques on the remaining graph - cliques = self.findMaximalCliques(n) - - # add disjoint cliques to the solution and remove the covered edges from the graph - cliques.sort(key=len, reverse=True) - for clique in cliques: - ok = True - for v in clique: - if len(n[v] & clique) != len(clique) - 1: - ok = False - break - if ok: - 
c.append(clique) - for v in clique: - n[v] -= clique - #foreach clique - #loop - #findEdgeDisjointCliques() - - - def findMaximalCliques(self, neighbors): - # neighbors = {'a':{'b','c'}, 'b':{'a'}, 'c':{'a'}, ...} - # 'a' not in neighbors['a'] - # 'b' in neighbors['a'] => 'a' in neighbors['b'] - # - # this implementation of the Bron-Kerbosch algorithm incorporates the - # top-level degeneracy ordering described in: - # Listing All Maximal Cliques in Sparse Graphs in Near-optimal Time - # David Eppstein, Maarten Loeffler, Darren Strash - - # build vertex-degree and degree-vertices maps - vd = dict() - dv = list() - for v in neighbors: - d = len(neighbors[v]) - vd[v] = d - while len(dv) <= d: - dv.append(set()) - dv[d].add(v) - #foreach vertex - - # compute degeneracy ordering - o = list() - while len(dv) > 0: - for dvSet in dv: - try: - v = dvSet.pop() - except KeyError: - continue - o.append(v) - vd[v] = None - for u in neighbors[v]: - if vd[u]: - dv[vd[u]].remove(u) - vd[u] -= 1 - dv[vd[u]].add(u) - while len(dv) > 0 and len(dv[-1]) == 0: - dv.pop() - break - #for dvSet in dv (until dvSet is non-empty) - #while dv remains - vd = dv = None - - # run first recursion layer in degeneracy order - p = set(o) - x = set() - c = list() - for v in o: - self._findMaximalCliques_recurse({v}, p & neighbors[v], x & neighbors[v], neighbors, c) - p.remove(v) - x.add(v) - return c - #findMaximalCliques() - - - def _findMaximalCliques_recurse(self, r, p, x, n, c): - if len(p) == 0: - if len(x) == 0: - return c.append(r) - else: - # cursory tests yield best performance by choosing the pivot - # arbitrarily from x first if x is not empty, else p; also tried - # picking from p always, picking the pivot with highest degree, - # and picking the pivot earliest in degeneracy order - u = iter(x).next() if (len(x) > 0) else iter(p).next() - for v in (p - n[u]): - self._findMaximalCliques_recurse(r | {v}, p & n[v], x & n[v], n, c) - p.remove(v) - x.add(v) - #_findMaximalCliques_recurse() - - - def downloadFilesFromFTP(self, remHost, remFiles): - # remFiles=function(ftp) or {'filename.ext':'/path/on/remote/host/to/filename.ext',...} - # connect to source server - self.log("connecting to FTP server %s ..." % remHost) - ftp = ftplib.FTP(remHost, timeout=21600) - ftp.login() # anonymous - self.log(" OK\n") - - # if remFiles is callable, let it identify the files it wants - if hasattr(remFiles, '__call__'): - self.log("locating current files ...") - remFiles = remFiles(ftp) - self.log(" OK\n") - - # check local file sizes and times, and identify all needed remote paths - remDirs = set() - remSize = {} - remTime = {} - locSize = {} - locTime = {} - for (locPath, remFile) in remFiles.items(): - remDirs.add(remFile[0:remFile.rfind('/')]) - - remSize[remFile] = None - remTime[remFile] = None - locSize[locPath] = None - locTime[locPath] = None - if os.path.exists(locPath): - stat = os.stat(locPath) - locSize[locPath] = int(stat.st_size) - locTime[locPath] = datetime.datetime.fromtimestamp(stat.st_mtime) - - # define FTP directory list parser - # unfortunately the FTP protocol doesn't specify an easily parse-able - # format, but most servers return "ls -l"-ish space-delimited columns - # (permissions) (?) 
(user) (group) (size) (month) (day) (year-or-time) (filename) - now = datetime.datetime.utcnow() - def ftpDirCB(rem_dir, line): - words = line.split() - remFn = rem_dir + "/" + words[8] - if len(words) >= 9 and remFn in remSize: - remSize[remFn] = int(words[4]) - timestamp = ' '.join(words[5:8]) - try: - time = datetime.datetime.strptime(timestamp,'%b %d %Y') - except ValueError: - try: - time = datetime.datetime.strptime("%s %d" % (timestamp,now.year),'%b %d %H:%M %Y') - except ValueError: - try: - time = datetime.datetime.strptime("%s %d" % (timestamp,now.year-1),'%b %d %H:%M %Y') - except ValueError: - time = now - if ( - (time.year == now.year and time.month > now.month) or - (time.year == now.year and time.month == now.month and time.day > now.day) - ): - time = time.replace(year=now.year-1) - remTime[remFn] = time - - # check remote file sizes and times - #self.log("identifying changed files ...\n") - for remDir in remDirs: - ftp.dir(remDir, lambda x: ftpDirCB(remDir, x)) - #self.log("identifying changed files completed\n") - - # download files as needed - #self.logPush("downloading changed files ...\n") - for locPath in sorted(remFiles.keys()): - if remSize[remFiles[locPath]] == locSize[locPath] and remTime[remFiles[locPath]] <= locTime[locPath]: - self.log("%s: up to date\n" % locPath.split('/')[-1]) - else: - self.log("%s: downloading ...\n" % locPath.split('/')[-1]) - #TODO: download to temp file, then rename? - with open(locPath, 'wb') as locFile: - #ftp.cwd(remFiles[locPath][0:remFiles[locPath].rfind('/')]) - ftp.retrbinary('RETR '+remFiles[locPath], locFile.write) - - #TODO: verify file size and retry a few times if necessary - - self.log("%s: downloaded\n" % locPath.split('/')[-1]) - - modTime = time.mktime(remTime[remFiles[locPath]].utctimetuple()) - os.utime(locPath, (modTime,modTime)) - - # disconnect from source server - try: - ftp.quit() - except Exception: - ftp.close() - - #self.logPop("downloading changed files completed\n") - #downloadFilesFromFTP() - - - def getHTTPHeaders(self, remHost, remURL, reqData=None, reqHeaders=None): - class NoRedirection(urllib2.HTTPErrorProcessor): - def http_response(self, request, response): - return response - https_response = http_response - #NoRedirection - opener = urllib2.build_opener(NoRedirection) - - if reqData and reqData is not str: - reqData = urllib.parse.urlencode(reqData, True) - request = urllib2.Request(url='http://'+remHost+remURL, data=reqData, headers=(reqHeaders or {})) # type: ignore - if not reqData: - request.get_method = lambda: 'HEAD' - response = opener.open(request) - respInfo = response.info() - respHeaders = dict( (h.lower(),respInfo[h]) for h in respInfo ) - response.close() - return respHeaders - #getHTTPHeaders() - - - def downloadFilesFromHTTP(self, remHost, remFiles, reqHeaders=None, alwaysDownload=False): - # remFiles={'filename.ext':'/path/on/remote/host/to/filename.ext',...} - return self._downloadHTTP('http', remHost, remFiles, reqHeaders, alwaysDownload) - #downloadFilesFromHTTP() - - - def downloadFilesFromHTTPS(self, remHost, remFiles, reqHeaders=None, alwaysDownload=False): - # remFiles={'filename.ext':'/path/on/remote/host/to/filename.ext',...} - return self._downloadHTTP('https', remHost, remFiles, reqHeaders, alwaysDownload) - #downloadFilesFromHTTPS() - - - def _downloadHTTP(self, remProtocol, remHost, remFiles, reqHeaders, alwaysDownload): - # check local file sizes and times - remSize = {} - remTime = {} - locSize = {} - locTime = {} - for locPath in remFiles: - remSize[locPath] = None - 
remTime[locPath] = None - locSize[locPath] = None - locTime[locPath] = None - if os.path.exists(locPath): - stat = os.stat(locPath) - locSize[locPath] = int(stat.st_size) - locTime[locPath] = datetime.datetime.fromtimestamp(stat.st_mtime) - # check remote file sizes and times - if not alwaysDownload: - #self.log("identifying changed files ...\n") - for locPath in remFiles: - request = urllib2.Request(remProtocol+'://'+remHost+remFiles[locPath]) - request.get_method = lambda: 'HEAD' - request.add_header('user-agent', 'RitchieLab/LOKI') - for k,v in (reqHeaders or {}).items(): - request.add_header(k, v) - response = urllib2.urlopen(request) - info = response.info() - - content_length = info.get('content-length') - if content_length: - remSize[locPath] = int(content_length) - - last_modified = info.get('last-modified') - if last_modified: - try: - remTime[locPath] = datetime.datetime.strptime(last_modified,'%a, %d %b %Y %H:%M:%S %Z') - except ValueError: - remTime[locPath] = datetime.datetime.utcnow() - - response.close() - #self.log("identifying changed files completed\n") - #if not alwaysDownload - - # download files as needed - #self.logPush("downloading changed files ...\n") - for locPath in sorted(remFiles.keys()): - if remSize[locPath] and remSize[locPath] == locSize[locPath] and remTime[locPath] and remTime[locPath] <= locTime[locPath]: - self.log("%s: up to date\n" % locPath.split('/')[-1]) - else: - self.log("%s: downloading ...\n" % locPath.split('/')[-1]) - #TODO: download to temp file, then rename? - if remProtocol == 'https': - with open(locPath, 'wb') as locFile: - request = urllib2.Request(remProtocol+'://'+remHost+remFiles[locPath]) - request.add_header('user-agent', 'RitchieLab/LOKI') - for k,v in (reqHeaders or {}).items(): - request.add_header(k, v) - response = urllib2.urlopen(request) - while True: - data = response.read() - if not data: - break - locFile.write(data) - response.close() - self.log("%s: downloaded\n" % locPath.split('/')[-1]) - continue - - link = remProtocol + '://' + remHost + remFiles[locPath] - wget.download(link, bar=None) - os.rename(remFiles[locPath].rsplit('/')[-1],locPath) - - self.log("%s: downloaded\n" % locPath.split('/')[-1]) - if remTime[locPath]: - modTime = time.mktime(remTime[locPath].utctimetuple()) - os.utime(locPath, (modTime,modTime)) - #self.logPop("downloading changed files completed\n") - #_downloadHTTP() - - -#Source + + ################################################## + # constructor + + def __init__(self, lokidb): + assert isinstance(lokidb, loki_db.Database) + assert self.__class__.__name__.startswith("Source_") + self._loki = lokidb + self._db = lokidb._db + self._sourceID = self.addSource(self.getSourceName()) + assert self._sourceID > 0 + + # __init__() + + ################################################## + # source interface + + @classmethod + def getVersionString(cls): + # when checked out from SVN, these $-delimited strings are magically kept updated + rev = "$Revision$".split() + date = "$Date$".split() + stat = None + + if len(rev) > 2: + version = "r%s" % rev[1:2] + else: + stat = stat or os.stat(sys.modules[cls.__module__].__file__) # type: ignore + version = "%s" % (stat.st_size,) + + if len(date) > 3: + version += " (%s %s)" % date[1:3] + else: + stat = stat or os.stat(sys.modules[cls.__module__].__file__) # type: ignore + version += datetime.datetime.utcfromtimestamp(stat.st_mtime).strftime( + " (%Y-%m-%d)" if (len(rev) > 2) else " (%Y-%m-%d %H:%M:%S)" + ) + + return version + + # getVersionString() + + 
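+    # Illustrative sketch (comments only; the example names are hypothetical):
+    # how a concrete loader plugs into this base class. Real loaders live in
+    # loki/loaders/ as modules named loki_source_<name> that define a class
+    # Source_<name>, which the updater discovers by that module prefix.
+    #
+    #   from loki.loaders.loki_source_example import Source_example
+    #
+    #   db = ...                     # an open, writeable loki_db.Database
+    #   src = Source_example(db)     # __init__() registers the source via addSource()
+    #   src.getSourceName()          # -> "example" (class name minus "Source_")
+    #   src.getVersionString()       # loader version reported to the updater
+    #   # concrete loaders must also override download() and update()
+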
@classmethod + def getOptions(cls): + return None + + # getOptions() + + def validateOptions(self, options): + for o in options: + return "unexpected option '%s'" % o + return True + + # validateOptions() + + def download(self, options): + raise Exception("invalid LOKI Source plugin: download() not implemented") + + # download() + + def update(self, options): + raise Exception("invalid LOKI Source plugin: update() not implemented") + + # update() + + ################################################## + # context manager + + def __enter__(self): + return self._loki.__enter__() + + # __enter__() + + def __exit__(self, excType, excVal, traceback): + return self._loki.__exit__(excType, excVal, traceback) + + # __exit__() + + ################################################## + # logging + + def log(self, message=""): + return self._loki.log(message) + + # log() + + def logPush(self, message=None): + return self._loki.logPush(message) + + # logPush() + + def logPop(self, message=None): + return self._loki.logPop(message) + + # logPop() + + ################################################## + # database update + + def prepareTableForUpdate(self, table): + return self._loki.prepareTableForUpdate(table) + + # prepareTableUpdate() + + def prepareTableForQuery(self, table): + return self._loki.prepareTableForQuery(table) + + # prepareTableQuery() + + ################################################## + # metadata management + + def addLDProfile(self, ldprofile, description=None, metric=None, value=None): + return self.addLDProfiles([(ldprofile, description, metric, value)])[ldprofile] + + # addLDProfile() + + def addLDProfiles(self, ldprofiles): + # ldprofiles=[ (ldprofile,description,metric,value), ... ] + dbc = self._db.cursor() + ret = {} + # use ABORT to avoid wasting autoincrements on existing rows, + # and execute() to avoid bailing out of executemany() due to ABORT + for ld in ldprofiles: + try: + dbc.execute( + "INSERT OR ABORT INTO `db`.`ldprofile` (ldprofile,description,metric,value) VALUES (LOWER(?),?,LOWER(?),?); SELECT LAST_INSERT_ROWID()", + ld, + ) + except apsw.ConstraintError: + dbc.execute( + "SELECT ldprofile_id FROM `db`.`ldprofile` WHERE ldprofile = LOWER(?)", + ld[0:1], + ) + for row in dbc: + ret[ld[0]] = row[0] + return ret + + # addLDProfiles() + + def addNamespace(self, namespace, polygenic=0): + return self.addNamespaces([(namespace, polygenic)])[namespace] + + # addNamespace() + + def addNamespaces(self, namespaces): + # namespaces=[ (namespace,polygenic), ... ] + dbc = self._db.cursor() + ret = {} + # use ABORT to avoid wasting autoincrements on existing rows, + # and execute() to avoid bailing out of executemany() due to ABORT + for n in namespaces: + try: + dbc.execute( + "INSERT OR ABORT INTO `db`.`namespace` (namespace,polygenic) VALUES (LOWER(?),?); SELECT LAST_INSERT_ROWID()", + n, + ) + except apsw.ConstraintError: + dbc.execute( + "SELECT namespace_id FROM `db`.`namespace` WHERE namespace = LOWER(?)", + n[0:1], + ) + for row in dbc: + ret[n[0]] = row[0] + return ret + + # addNamespaces() + + def addRelationship(self, relationship): + return self.addRelationships([(relationship,)])[relationship] + + # addRelationship() + + def addRelationships(self, relationships): + # relationships=[ (relationship,), ... 
] + dbc = self._db.cursor() + ret = {} + # use ABORT to avoid wasting autoincrements on existing rows, + # and execute() to avoid bailing out of executemany() due to ABORT + for r in relationships: + try: + dbc.execute( + "INSERT OR ABORT INTO `db`.`relationship` (relationship) VALUES (LOWER(?)); SELECT LAST_INSERT_ROWID()", + r, + ) + except apsw.ConstraintError: + dbc.execute( + "SELECT relationship_id FROM `db`.`relationship` WHERE relationship = LOWER(?)", + r[0:1], + ) + for row in dbc: + ret[r[0]] = row[0] + return ret + + # addRelationships() + + def addRole(self, role, description=None, coding=None, exon=None): + return self.addRoles([(role, description, coding, exon)])[role] + + # addRole() + + def addRoles(self, roles): + # roles=[ (role,description,coding,exon), ... ] + dbc = self._db.cursor() + ret = {} + # use ABORT to avoid wasting autoincrements on existing rows, + # and execute() to avoid bailing out of executemany() due to ABORT + for r in roles: + try: + dbc.execute( + "INSERT OR ABORT INTO `db`.`role` (role,description,coding,exon) VALUES (LOWER(?),?,?,?); SELECT LAST_INSERT_ROWID()", + r, + ) + except apsw.ConstraintError: + dbc.execute( + "SELECT role_id FROM `db`.`role` WHERE role = LOWER(?)", r[0:1] + ) + for row in dbc: + ret[r[0]] = row[0] + return ret + + # addRoles() + + def addSource(self, source): + return self.addSources([(source,)])[source] + + # addSource() + + def addSources(self, sources): + # sources=[ (source,), ... ] + dbc = self._db.cursor() + ret = {} + # use ABORT to avoid wasting autoincrements on existing rows, + # and execute() to avoid bailing out of executemany() due to ABORT + for s in sources: + try: + dbc.execute( + "INSERT OR ABORT INTO `db`.`source` (source) VALUES (LOWER(?)); SELECT LAST_INSERT_ROWID()", + s, + ) + except apsw.ConstraintError: + dbc.execute( + "SELECT source_id FROM `db`.`source` WHERE source = LOWER(?)", + s[0:1], + ) + for row in dbc: + ret[s[0]] = row[0] + return ret + + # addSources() + + def addType(self, type): + return self.addTypes([(type,)])[type] + + # addType() + + def addTypes(self, types): + # types=[ (type,), ... ] + dbc = self._db.cursor() + ret = {} + # use ABORT to avoid wasting autoincrements on existing rows, + # and execute() to avoid bailing out of executemany() due to ABORT + for t in types: + try: + dbc.execute( + "INSERT OR ABORT INTO `db`.`type` (type) VALUES (LOWER(?)); SELECT LAST_INSERT_ROWID()", + t, + ) + except apsw.ConstraintError: + dbc.execute( + "SELECT type_id FROM `db`.`type` WHERE type = LOWER(?)", t[0:1] + ) + for row in dbc: + ret[t[0]] = row[0] + return ret + + # addTypes() + + def addSubtypes(self, subtypes): + # types=[ (type,), ... 
] + dbc = self._db.cursor() + ret = {} + # use ABORT to avoid wasting autoincrements on existing rows, + # and execute() to avoid bailing out of executemany() due to ABORT + for t in subtypes: + try: + dbc.execute( + "INSERT OR ABORT INTO `db`.`subtype` (subtype) VALUES (LOWER(?)); SELECT LAST_INSERT_ROWID()", + t, + ) + except apsw.ConstraintError: + dbc.execute( + "SELECT subtype_id FROM `db`.`subtype` WHERE subtype = LOWER(?)", + t[0:1], + ) + for row in dbc: + ret[t[0]] = row[0] + return ret + + # addTypes() + + def deleteAll(self): + dbc = self._db.cursor() + tables = [ + "snp_merge", + "snp_locus", + "snp_entrez_role", + "biopolymer", + "biopolymer_name", + "biopolymer_name_name", + "biopolymer_region", + "group", + "group_name", + "group_group", + "group_biopolymer", + "group_member_name", + "chain", + "chain_data", + "gwas", + ] + for table in tables: + dbc.execute( + "DELETE FROM `db`.`%s` WHERE source_id = %d" + % (table, self.getSourceID()) + ) + + # deleteAll() + + ################################################## + # source metadata management + + def getSourceName(self): + return self.__class__.__name__[7:] + + # getSourceName() + + def getSourceID(self): + return self._sourceID + + # getSourceID() + + def setSourceBuilds(self, grch=None, ucschg=None): + sql = "UPDATE `db`.`source` SET grch = ?, ucschg = ?, current_ucschg = ? WHERE source_id = ?" + self._db.cursor().execute(sql, (grch, ucschg, ucschg, self.getSourceID())) + + # setSourceBuilds() + + ################################################## + # snp data management + + def addSNPMerges(self, snpMerges): + # snpMerges=[ (rsMerged,rsCurrent), ... ] + self.prepareTableForUpdate("snp_merge") + sql = ( + "INSERT OR IGNORE INTO `db`.`snp_merge` (rsMerged,rsCurrent,source_id) VALUES (?,?,%d)" + % (self.getSourceID(),) + ) + with self._db: + self._db.cursor().executemany(sql, snpMerges) + + # addSNPMerges() + + def addSNPLoci(self, snpLoci): + # snpLoci=[ (rs,chr,pos,validated), ... ] + self.prepareTableForUpdate("snp_locus") + sql = ( + "INSERT OR IGNORE INTO `db`.`snp_locus` (rs,chr,pos,validated,source_id) VALUES (?,?,?,?,%d)" + % (self.getSourceID(),) + ) + with self._db: # type: ignore + self._db.cursor().executemany(sql, snpLoci) + + # addSNPLoci() + + def addChromosomeSNPLoci(self, chromosome, snpLoci): + # snpLoci=[ (rs,pos,validated), ... ] + # self.prepareTableForUpdate('snp_locus') + sql = ( + "INSERT OR IGNORE INTO `db`.`snp_locus` (rs,chr,pos,validated,source_id) VALUES (?,%d,?,?,%d)" + % ( + chromosome, + self.getSourceID(), + ) + ) + # with self._db: + self._db.cursor().executemany(sql, snpLoci) + + # addChromosomeSNPLoci() + + def addSNPEntrezRoles(self, snpRoles): + # snpRoles=[ (rs,entrez_id,role_id), ... ] + self.prepareTableForUpdate("snp_entrez_role") + sql = ( + "INSERT OR IGNORE INTO `db`.`snp_entrez_role` (rs,entrez_id,role_id,source_id) VALUES (?,?,?,%d)" + % (self.getSourceID(),) + ) + with self._db: # type: ignore + self._db.cursor().executemany(sql, snpRoles) + + # addSNPEntrezRoles() + + ################################################## + # biopolymer data management + + def addBiopolymers(self, biopolymers): + # biopolymers=[ (type_id,label,description), ... 
] + self.prepareTableForUpdate("biopolymer") + sql = ( + "INSERT INTO `db`.`biopolymer` (type_id,label,description,source_id) VALUES (?,?,?,%d); SELECT last_insert_rowid()" + % (self.getSourceID(),) + ) + return [row[0] for row in self._db.cursor().executemany(sql, biopolymers)] + + # addBiopolymers() + + def addTypedBiopolymers(self, typeID, biopolymers): + # biopolymers=[ (label,description), ... ] + # self.prepareTableForUpdate('biopolymer') + sql = ( + "INSERT INTO `db`.`biopolymer` (type_id,label,description,source_id) VALUES (%d,?,?,%d); SELECT last_insert_rowid()" + % ( + typeID, + self.getSourceID(), + ) + ) + return [row[0] for row in self._db.cursor().executemany(sql, biopolymers)] + + # addTypedBiopolymers() + + def addBiopolymerNames(self, biopolymerNames): + # biopolymerNames=[ (biopolymer_id,namespace_id,name), ... ] + self.prepareTableForUpdate("biopolymer_name") + sql = ( + "INSERT OR IGNORE INTO `db`.`biopolymer_name` (biopolymer_id,namespace_id,name,source_id) VALUES (?,?,?,%d)" + % (self.getSourceID(),) + ) + self._db.cursor().executemany(sql, biopolymerNames) + + # addBiopolymerNames() + + def addBiopolymerNamespacedNames(self, namespaceID, biopolymerNames): + # biopolymerNames=[ (biopolymer_id,name), ... ] + # self.prepareTableForUpdate('biopolymer_name') + sql = ( + "INSERT OR IGNORE INTO `db`.`biopolymer_name` (biopolymer_id,namespace_id,name,source_id) VALUES (?,%d,?,%d)" + % ( + namespaceID, + self.getSourceID(), + ) + ) + self._db.cursor().executemany(sql, biopolymerNames) + + # addBiopolymerNamespacedNames() + + def addBiopolymerNameNames(self, biopolymerNameNames): + # biopolymerNameNames=[ (old_namespace_id,old_name,old_type_id,new_namespace_id,new_name), ... ] + self.prepareTableForUpdate("biopolymer_name_name") + sql = ( + "INSERT OR IGNORE INTO `db`.`biopolymer_name_name` (namespace_id,name,type_id,new_namespace_id,new_name,source_id) VALUES (?,?,?,?,?,%d)" + % (self.getSourceID(),) + ) + self._db.cursor().executemany(sql, biopolymerNameNames) + + # addBiopolymerNameNames() + + def addBiopolymerTypedNameNamespacedNames( + self, oldTypeID, newNamespaceID, biopolymerNameNames + ): + # biopolymerNameNames=[ (old_namespace_id,old_name,new_name), ... ] + self.prepareTableForUpdate("biopolymer_name_name") + sql = ( + "INSERT OR IGNORE INTO `db`.`biopolymer_name_name` (namespace_id,name,type_id,new_namespace_id,new_name,source_id) VALUES (?,?,%d,%d,?,%d)" + % ( + oldTypeID, + newNamespaceID, + self.getSourceID(), + ) + ) + self._db.cursor().executemany(sql, biopolymerNameNames) + + # addBiopolymerTypedNameNamespacedNames() + + def addBiopolymerRegions(self, biopolymerRegions): + # biopolymerRegions=[ (biopolymer_id,ldprofile_id,chr,posMin,posMax), ... ] + self.prepareTableForUpdate("biopolymer_region") + sql = ( + "INSERT OR IGNORE INTO `db`.`biopolymer_region` (biopolymer_id,ldprofile_id,chr,posMin,posMax,source_id) VALUES (?,?,?,?,?,%d)" + % (self.getSourceID(),) + ) + self._db.cursor().executemany(sql, biopolymerRegions) + + # addBiopolymerRegions() + + def addBiopolymerLDProfileRegions(self, ldprofileID, biopolymerRegions): + # biopolymerRegions=[ (biopolymer_id,chr,posMin,posMax), ... 
] + # self.prepareTableForUpdate('biopolymer_region') + sql = ( + "INSERT OR IGNORE INTO `db`.`biopolymer_region` (biopolymer_id,ldprofile_id,chr,posMin,posMax,source_id) VALUES (?,%d,?,?,?,%d)" + % ( + ldprofileID, + self.getSourceID(), + ) + ) + self._db.cursor().executemany(sql, biopolymerRegions) + + # addBiopolymerLDProfileRegions() + + ################################################## + # group data management + + def addGroups(self, groups): + # groups=[ (type_id,subtype_id,label,description), ... ] + self.prepareTableForUpdate("group") + sql = ( + "INSERT INTO `db`.`group` (type_id,subtype_id,label,description,source_id) VALUES (?,?,?,?,%d); SELECT last_insert_rowid()" + % (self.getSourceID(),) + ) + return [row[0] for row in self._db.cursor().executemany(sql, groups)] + + # addGroups() + + def addTypedGroups(self, typeID, groups): + # groups=[ (subtype,label,description), ... ] + # self.prepareTableForUpdate('group') + sql = ( + "INSERT INTO `db`.`group` (type_id,subtype_id,label,description,source_id) VALUES (%d,?,?,?,%d); SELECT last_insert_rowid()" + % ( + typeID, + self.getSourceID(), + ) + ) + return [row[0] for row in self._db.cursor().executemany(sql, groups)] + + # addTypedGroups() + + def addGroupNames(self, groupNames): + # groupNames=[ (group_id,namespace_id,name), ... ] + self.prepareTableForUpdate("group_name") + sql = ( + "INSERT OR IGNORE INTO `db`.`group_name` (group_id,namespace_id,name,source_id) VALUES (?,?,?,%d)" + % (self.getSourceID(),) + ) + self._db.cursor().executemany(sql, groupNames) + + # addGroupNames() + + def addGroupNamespacedNames(self, namespaceID, groupNames): + # groupNames=[ (group_id,name), ... ] + # self.prepareTableForUpdate('group_name') + sql = ( + "INSERT OR IGNORE INTO `db`.`group_name` (group_id,namespace_id,name,source_id) VALUES (?,%d,?,%d)" + % ( + namespaceID, + self.getSourceID(), + ) + ) + self._db.cursor().executemany(sql, groupNames) + + # addGroupNamespacedNames() + + def addGroupRelationships(self, groupRels): + # groupRels=[ (group_id,related_group_id,relationship_id,contains), ... ] + # self.prepareTableForUpdate('group_group') + # we SHOULD be able to do (?1,?2,?3) and (?2,?1,?3) with the same 3 bindings for each execution, + # but apsw or SQLite appears to treat the compound statement separately, so we have to copy the bindings + sql = "INSERT OR IGNORE INTO `db`.`group_group` (group_id,related_group_id,relationship_id,direction,contains,source_id)" + sql += ( + " VALUES (?1,?2,?3,1,(CASE WHEN ?4 IS NULL THEN NULL WHEN ?4 > 0 THEN 1 WHEN ?4 < 0 THEN -1 ELSE 0 END),%d)" + % (self.getSourceID(),) + ) + sql += ";INSERT OR IGNORE INTO `db`.`group_group` (group_id,related_group_id,relationship_id,direction,contains,source_id)" + sql += ( + " VALUES (?2,?1,?3,-1,(CASE WHEN ?4 IS NULL THEN NULL WHEN ?4 > 0 THEN -1 WHEN ?4 < 0 THEN 1 ELSE 0 END),%d)" + % (self.getSourceID(),) + ) + self._db.cursor().executemany(sql, (2 * gr for gr in groupRels)) # type: ignore + + # addGroupRelationships() + + def addGroupParentRelationships(self, groupRels): + # groupRels=[ (group_id,related_group_id,relationship_id), ... 
] + self.prepareTableForUpdate("group_group") + sql = "INSERT OR IGNORE INTO `db`.`group_group` (group_id,related_group_id,relationship_id,direction,contains,source_id)" + sql += " VALUES (?1,?2,?3,1,1,%d)" % (self.getSourceID(),) + sql += ";INSERT OR IGNORE INTO `db`.`group_group` (group_id,related_group_id,relationship_id,direction,contains,source_id)" + sql += " VALUES (?2,?1,?3,-1,-1,%d)" % (self.getSourceID(),) + self._db.cursor().executemany(sql, (2 * gr for gr in groupRels)) # type: ignore + + # addGroupParentRelationships() + + def addGroupChildRelationships(self, groupRels): + # groupRels=[ (group_id,related_group_id,relationship_id), ... ] + self.prepareTableForUpdate("group_group") + sql = "INSERT OR IGNORE INTO `db`.`group_group` (group_id,related_group_id,relationship_id,direction,contains,source_id)" + sql += " VALUES (?1,?2,?3,1,-1,%d)" % (self.getSourceID(),) + sql += ";INSERT OR IGNORE INTO `db`.`group_group` (group_id,related_group_id,relationship_id,direction,contains,source_id)" + sql += " VALUES (?2,?1,?3,-1,1,%d)" % (self.getSourceID(),) + self._db.cursor().executemany(sql, (2 * gr for gr in groupRels)) # type: ignore + + # addGroupChildRelationships() + + def addGroupSiblingRelationships(self, groupRels): + # groupRels=[ (group_id,related_group_id,relationship_id), ... ] + self.prepareTableForUpdate("group_group") + sql = "INSERT OR IGNORE INTO `db`.`group_group` (group_id,related_group_id,relationship_id,direction,contains,source_id)" + sql += " VALUES (?1,?2,?3,1,0,%d)" % (self.getSourceID(),) + sql += ";INSERT OR IGNORE INTO `db`.`group_group` (group_id,related_group_id,relationship_id,direction,contains,source_id)" + sql += " VALUES (?2,?1,?3,-1,0,%d)" % (self.getSourceID(),) + self._db.cursor().executemany(sql, (2 * gr for gr in groupRels)) # type: ignore + + # addGroupSiblingRelationships() + + def addGroupBiopolymers(self, groupBiopolymers): + # groupBiopolymers=[ (group_id,biopolymer_id), ... ] + # self.prepareTableForUpdate('group_biopolymer') + sql = ( + "INSERT OR IGNORE INTO `db`.`group_biopolymer` (group_id,biopolymer_id,specificity,implication,quality,source_id) VALUES (?,?,100,100,100,%d)" + % (self.getSourceID(),) + ) + self._db.cursor().executemany(sql, groupBiopolymers) + + # addGroupBiopolymers() + + def addGroupMemberNames(self, groupMemberNames): + # groupMemberNames=[ (group_id,member,type_id,namespace_id,name), ... ] + self.prepareTableForUpdate("group_member_name") + sql = ( + "INSERT OR IGNORE INTO `db`.`group_member_name` (group_id,member,type_id,namespace_id,name,source_id) VALUES (?,?,?,?,?,%d)" + % (self.getSourceID(),) + ) + self._db.cursor().executemany(sql, groupMemberNames) + + # addGroupMemberNames() + + def addGroupMemberTypedNamespacedNames(self, typeID, namespaceID, groupMemberNames): + # groupMemberNames=[ (group_id,member,name), ... ] + self.prepareTableForUpdate("group_member_name") + sql = ( + "INSERT OR IGNORE INTO `db`.`group_member_name` (group_id,member,type_id,namespace_id,name,source_id) VALUES (?,?,%d,%d,?,%d)" + % ( + typeID, + namespaceID, + self.getSourceID(), + ) + ) + self._db.cursor().executemany(sql, groupMemberNames) + + # addGroupMemberTypedNamespacedNames() + + ################################################## + # liftover data management + + def addChains(self, old_ucschg, new_ucschg, chain_list): + # chain_list=[ (score,old_chr,old_start,old_end,new_chr,new_start,new_end,is_forward), ... ] + """ + Adds all of the chains described in chain_list and returns the + ids of the added chains. 
The chain_list must be an iterable + container of objects that can be inserted into the chain table + """ + self.prepareTableForUpdate("chain") + sql = "INSERT INTO `db`.`chain` (score,old_ucschg,old_chr,old_start,old_end,new_ucschg,new_chr,new_start,new_end,is_fwd,source_id)" + sql += " VALUES (?,%d,?,?,?,%d,?,?,?,?,%d); SELECT last_insert_rowid()" % ( + old_ucschg, + new_ucschg, + self.getSourceID(), + ) + return [row[0] for row in self._db.cursor().executemany(sql, chain_list)] + + # addChains() + + def addChainData(self, chain_data_list): + """ + Adds all of the chain data into the chain data table + """ + self.prepareTableForUpdate("chain_data") + sql = ( + "INSERT INTO `db`.`chain_data` (chain_id,old_start,old_end,new_start,source_id) VALUES (?,?,?,?,%d)" + % (self.getSourceID(),) + ) + self._db.cursor().executemany(sql, chain_data_list) + + # addChainData() + + ################################################## + # gwas data management + + def addGWASAnnotations(self, gwasAnnotations): + # gwasAnnotations=[ (rs,chm,pos,trait,snps,orBeta,allele95ci,riskAfreq,pubmedID), ... ] + self.prepareTableForUpdate("gwas") + sql = ( + "INSERT OR IGNORE INTO `db`.`gwas` (rs,chr,pos,trait,snps,orbeta,allele95ci,riskAfreq,pubmed_id,source_id) VALUES (?,?,?,?,?,?,?,?,?,%d)" + % (self.getSourceID(),) + ) + self._db.cursor().executemany(sql, gwasAnnotations) + + # addGWASAnnotations() + + ################################################## + # source utility methods + + def zfile(self, fileName, splitChar="\n", chunkSize=1 * 1024 * 1024): + dc = zlib.decompressobj(zlib.MAX_WBITS | 32) # autodetect gzip or zlib header + with open(fileName, "rb") as filePtr: + text = "" + while dc: + data = filePtr.read(chunkSize) + if data: + decompressedData = dc.decompress(data) + text += decompressedData.decode("utf-8") + data = None + else: + text += dc.flush().decode("utf-8") + dc = None + if text: + lines = text.split(splitChar) + i, x = 0, len(lines) - 1 + text = lines[x] + while i < x: + yield lines[i] + i += 1 + lines = None + # while data remains + if text: + yield text + # with fileName + + # zfile() + + def findConnectedComponents(self, neighbors): + f = set() + c = list() + for v in neighbors: + if v not in f: + f.add(v) + c.append(self._findConnectedComponents_recurse(neighbors, v, f, {v})) + return c + + # findConnectedComponents() + + def _findConnectedComponents_recurse(self, n, v, f, c): + for u in n[v]: + if u not in f: + f.add(u) + c.add(u) + self._findConnectedComponents_recurse(n, v, f, c) + return c + + # _findConnectedComponents_recurse() + + def findEdgeDisjointCliques(self, neighbors): + # neighbors = {'a':{'b','c'}, 'b':{'a'}, 'c':{'a'}, ...} + # 'a' not in neighbors['a'] + # 'b' in neighbors['a'] => 'a' in neighbors['b'] + + # clone neighbors so we can modify the local copy + n = {v: set(neighbors[v]) for v in neighbors} + c = list() + + while True: + # prune isolated vertices and extract hanging pairs + for v in n.keys(): + try: + if len(n[v]) == 0: + del n[v] + elif len(n[v]) == 1: + (u,) = n[v] + n[v].add(v) + c.append(n[v]) + del n[v] + n[u].remove(v) + if len(n[u]) == 0: + del n[u] + except KeyError: + pass + # foreach vertex + + # if nothing remains, we're done + if len(n) == 0: + return c + + # find maximal cliques on the remaining graph + cliques = self.findMaximalCliques(n) + + # add disjoint cliques to the solution and remove the covered edges from the graph + cliques.sort(key=len, reverse=True) + for clique in cliques: + ok = True + for v in clique: + if len(n[v] & clique) != 
len(clique) - 1:
+                        ok = False
+                        break
+                if ok:
+                    c.append(clique)
+                    for v in clique:
+                        n[v] -= clique
+            # foreach clique
+        # loop
+
+    # findEdgeDisjointCliques()
+
+    def findMaximalCliques(self, neighbors):
+        # neighbors = {'a':{'b','c'}, 'b':{'a'}, 'c':{'a'}, ...}
+        # 'a' not in neighbors['a']
+        # 'b' in neighbors['a'] => 'a' in neighbors['b']
+        #
+        # this implementation of the Bron-Kerbosch algorithm incorporates the
+        # top-level degeneracy ordering described in:
+        # Listing All Maximal Cliques in Sparse Graphs in Near-optimal Time
+        # David Eppstein, Maarten Loeffler, Darren Strash
+
+        # build vertex-degree and degree-vertices maps
+        vd = dict()
+        dv = list()
+        for v in neighbors:
+            d = len(neighbors[v])
+            vd[v] = d
+            while len(dv) <= d:
+                dv.append(set())
+            dv[d].add(v)
+        # foreach vertex
+
+        # compute degeneracy ordering
+        o = list()
+        while len(dv) > 0:
+            for dvSet in dv:
+                try:
+                    v = dvSet.pop()
+                except KeyError:
+                    continue
+                o.append(v)
+                vd[v] = None
+                for u in neighbors[v]:
+                    if vd[u]:
+                        dv[vd[u]].remove(u)
+                        vd[u] -= 1
+                        dv[vd[u]].add(u)
+                while len(dv) > 0 and len(dv[-1]) == 0:
+                    dv.pop()
+                break
+            # for dvSet in dv (until dvSet is non-empty)
+        # while dv remains
+        vd = dv = None
+
+        # run first recursion layer in degeneracy order
+        p = set(o)
+        x = set()
+        c = list()
+        for v in o:
+            self._findMaximalCliques_recurse(
+                {v}, p & neighbors[v], x & neighbors[v], neighbors, c
+            )
+            p.remove(v)
+            x.add(v)
+        return c
+
+    # findMaximalCliques()
+
+    def _findMaximalCliques_recurse(self, r, p, x, n, c):
+        if len(p) == 0:
+            if len(x) == 0:
+                return c.append(r)
+        else:
+            # cursory tests yield best performance by choosing the pivot
+            # arbitrarily from x first if x is not empty, else p; also tried
+            # picking from p always, picking the pivot with highest degree,
+            # and picking the pivot earliest in degeneracy order
+            u = next(iter(x)) if (len(x) > 0) else next(iter(p))
+            for v in p - n[u]:
+                self._findMaximalCliques_recurse(r | {v}, p & n[v], x & n[v], n, c)
+                p.remove(v)
+                x.add(v)
+
+    # _findMaximalCliques_recurse()
+
+    def downloadFilesFromFTP(self, remHost, remFiles):
+        # remFiles=function(ftp) or {'filename.ext':'/path/on/remote/host/to/filename.ext',...}
+        # connect to source server
+        self.log("connecting to FTP server %s ..." % remHost)
+        ftp = ftplib.FTP(remHost, timeout=21600)
+        ftp.login()  # anonymous
+        self.log(" OK\n")
+
+        # if remFiles is callable, let it identify the files it wants
+        if hasattr(remFiles, "__call__"):
+            self.log("locating current files ...")
+            remFiles = remFiles(ftp)
+            self.log(" OK\n")
+
+        # check local file sizes and times, and identify all needed remote paths
+        remDirs = set()
+        remSize = {}
+        remTime = {}
+        locSize = {}
+        locTime = {}
+        for locPath, remFile in remFiles.items():
+            remDirs.add(remFile[0 : remFile.rfind("/")])
+
+            remSize[remFile] = None
+            remTime[remFile] = None
+            locSize[locPath] = None
+            locTime[locPath] = None
+            if os.path.exists(locPath):
+                stat = os.stat(locPath)
+                locSize[locPath] = int(stat.st_size)
+                locTime[locPath] = datetime.datetime.fromtimestamp(stat.st_mtime)
+
+        # define FTP directory list parser
+        # unfortunately the FTP protocol doesn't specify an easily parse-able
+        # format, but most servers return "ls -l"-ish space-delimited columns
+        # (permissions) (?)
(user) (group) (size) (month) (day) (year-or-time) (filename) + now = datetime.datetime.utcnow() + + def ftpDirCB(rem_dir, line): + words = line.split() + remFn = rem_dir + "/" + words[8] + if len(words) >= 9 and remFn in remSize: + remSize[remFn] = int(words[4]) + timestamp = " ".join(words[5:8]) + try: + time = datetime.datetime.strptime(timestamp, "%b %d %Y") + except ValueError: + try: + time = datetime.datetime.strptime( + "%s %d" % (timestamp, now.year), "%b %d %H:%M %Y" + ) + except ValueError: + try: + time = datetime.datetime.strptime( + "%s %d" % (timestamp, now.year - 1), "%b %d %H:%M %Y" + ) + except ValueError: + time = now + if (time.year == now.year and time.month > now.month) or ( + time.year == now.year + and time.month == now.month + and time.day > now.day + ): + time = time.replace(year=now.year - 1) + remTime[remFn] = time + + # check remote file sizes and times + # self.log("identifying changed files ...\n") + for remDir in remDirs: + ftp.dir(remDir, lambda x: ftpDirCB(remDir, x)) + # self.log("identifying changed files completed\n") + + # download files as needed + # self.logPush("downloading changed files ...\n") + for locPath in sorted(remFiles.keys()): + if ( + remSize[remFiles[locPath]] == locSize[locPath] + and remTime[remFiles[locPath]] <= locTime[locPath] + ): + self.log("%s: up to date\n" % locPath.split("/")[-1]) + else: + self.log("%s: downloading ...\n" % locPath.split("/")[-1]) + # TODO: download to temp file, then rename? + with open(locPath, "wb") as locFile: + # ftp.cwd(remFiles[locPath][0:remFiles[locPath].rfind('/')]) + ftp.retrbinary("RETR " + remFiles[locPath], locFile.write) + + # TODO: verify file size and retry a few times if necessary + + self.log("%s: downloaded\n" % locPath.split("/")[-1]) + + modTime = time.mktime(remTime[remFiles[locPath]].utctimetuple()) + os.utime(locPath, (modTime, modTime)) + + # disconnect from source server + try: + ftp.quit() + except Exception: + ftp.close() + + # self.logPop("downloading changed files completed\n") + + # downloadFilesFromFTP() + + def getHTTPHeaders(self, remHost, remURL, reqData=None, reqHeaders=None): + class NoRedirection(urllib2.HTTPErrorProcessor): + def http_response(self, request, response): + return response + + https_response = http_response + + # NoRedirection + opener = urllib2.build_opener(NoRedirection) + + if reqData and reqData is not str: + reqData = urllib.parse.urlencode(reqData, True) + request = urllib2.Request(url="http://" + remHost + remURL, data=reqData, headers=(reqHeaders or {})) # type: ignore + if not reqData: + request.get_method = lambda: "HEAD" + response = opener.open(request) + respInfo = response.info() + respHeaders = dict((h.lower(), respInfo[h]) for h in respInfo) + response.close() + return respHeaders + + # getHTTPHeaders() + + def downloadFilesFromHTTP( + self, remHost, remFiles, reqHeaders=None, alwaysDownload=False + ): + # remFiles={'filename.ext':'/path/on/remote/host/to/filename.ext',...} + return self._downloadHTTP("http", remHost, remFiles, reqHeaders, alwaysDownload) + + # downloadFilesFromHTTP() + + def downloadFilesFromHTTPS( + self, remHost, remFiles, reqHeaders=None, alwaysDownload=False + ): + # remFiles={'filename.ext':'/path/on/remote/host/to/filename.ext',...} + return self._downloadHTTP( + "https", remHost, remFiles, reqHeaders, alwaysDownload + ) + + # downloadFilesFromHTTPS() + + def _downloadHTTP(self, remProtocol, remHost, remFiles, reqHeaders, alwaysDownload): + # check local file sizes and times + remSize = {} + remTime = {} + locSize = 
{} + locTime = {} + for locPath in remFiles: + remSize[locPath] = None + remTime[locPath] = None + locSize[locPath] = None + locTime[locPath] = None + if os.path.exists(locPath): + stat = os.stat(locPath) + locSize[locPath] = int(stat.st_size) + locTime[locPath] = datetime.datetime.fromtimestamp(stat.st_mtime) + # check remote file sizes and times + if not alwaysDownload: + # self.log("identifying changed files ...\n") + for locPath in remFiles: + request = urllib2.Request( + remProtocol + "://" + remHost + remFiles[locPath] + ) + request.get_method = lambda: "HEAD" + request.add_header("user-agent", "RitchieLab/LOKI") + for k, v in (reqHeaders or {}).items(): + request.add_header(k, v) + response = urllib2.urlopen(request) + info = response.info() + + content_length = info.get("content-length") + if content_length: + remSize[locPath] = int(content_length) + + last_modified = info.get("last-modified") + if last_modified: + try: + remTime[locPath] = datetime.datetime.strptime( + last_modified, "%a, %d %b %Y %H:%M:%S %Z" + ) + except ValueError: + remTime[locPath] = datetime.datetime.utcnow() + + response.close() + # self.log("identifying changed files completed\n") + # if not alwaysDownload + + # download files as needed + # self.logPush("downloading changed files ...\n") + for locPath in sorted(remFiles.keys()): + if ( + remSize[locPath] + and remSize[locPath] == locSize[locPath] + and remTime[locPath] + and remTime[locPath] <= locTime[locPath] + ): + self.log("%s: up to date\n" % locPath.split("/")[-1]) + else: + self.log("%s: downloading ...\n" % locPath.split("/")[-1]) + # TODO: download to temp file, then rename? + if remProtocol == "https": + with open(locPath, "wb") as locFile: + request = urllib2.Request( + remProtocol + "://" + remHost + remFiles[locPath] + ) + request.add_header("user-agent", "RitchieLab/LOKI") + for k, v in (reqHeaders or {}).items(): + request.add_header(k, v) + response = urllib2.urlopen(request) + while True: + data = response.read() + if not data: + break + locFile.write(data) + response.close() + self.log("%s: downloaded\n" % locPath.split("/")[-1]) + continue + + link = remProtocol + "://" + remHost + remFiles[locPath] + wget.download(link, bar=None) + os.rename(remFiles[locPath].rsplit("/")[-1], locPath) + + self.log("%s: downloaded\n" % locPath.split("/")[-1]) + if remTime[locPath]: + modTime = time.mktime(remTime[locPath].utctimetuple()) + os.utime(locPath, (modTime, modTime)) + # self.logPop("downloading changed files completed\n") + + # _downloadHTTP() + + +# Source diff --git a/loki/loki_updater.py b/loki/loki_updater.py index c4d241a..f724624 100644 --- a/loki/loki_updater.py +++ b/loki/loki_updater.py @@ -16,708 +16,898 @@ class Updater(object): - - - ################################################## - # constructor - - - def __init__(self, lokidb, is_test=False): - assert(isinstance(lokidb, loki_db.Database)) - self._is_test = is_test - self._loki = lokidb - self._db = lokidb._db - self._sourceLoaders = {} - self._sourceClasses = dict() - self._sourceObjects = dict() - self._sourceOptions = dict() - self._filehash = dict() - self._updating = False - self._tablesUpdated = set() - self._tablesDeindexed = set() - self.lock = Lock() - #__init__() - - - ################################################## - # logging - - - def log(self, message=""): - return self._loki.log(message) - #log() - - - def logPush(self, message=None): - return self._loki.logPush(message) - #logPush() - - - def logPop(self, message=None): - return self._loki.logPop(message) - 
#logPop() - - - ################################################## - # database update - - - def flagTableUpdate(self, table): - self._tablesUpdated.add(table) - #flagTableUpdate() - - - def prepareTableForUpdate(self, table): - if self._updating: - self.flagTableUpdate(table) - if table not in self._tablesDeindexed: - #print "deindexing %s" % table #DEBUG - self._tablesDeindexed.add(table) - self._loki.dropDatabaseIndices(None, 'db', table) - #prepareTableForUpdate() - - - def prepareTableForQuery(self, table): - if self._updating: - if table in self._tablesDeindexed: - #print "reindexing %s" % table DEBUG - self._tablesDeindexed.remove(table) - self._loki.createDatabaseIndices(None, 'db', table) - #prepareTableForQuery() - - - def findSourceModules(self): - if not self._sourceLoaders: - self._sourceLoaders = {} - loader_path = loaders.__path__ - if self._is_test: - loader_path = [os.path.join(loader, "test") for loader in loaders.__path__] - for path in loader_path: - for srcModuleName in os.listdir(path): - if srcModuleName.startswith('loki_source_'): - self._sourceLoaders[srcModuleName[12:-3]] = 1 - #findSourceModules() - - - def getSourceModules(self): - self.findSourceModules() - return self._sourceLoaders.keys() - #getSourceModules() - - - def loadSourceModules(self, sources=None): - self.findSourceModules() - srcSet = set() - for srcName in (set(sources) if sources else self._sourceLoaders.keys()): - if srcName not in self._sourceClasses: - if srcName not in self._sourceLoaders: - self.log("WARNING: unknown source '%s'\n" % srcName) - continue - #if module not available - srcModule = importlib.import_module('%s.loki_source_%s' % (loaders.__name__, srcName)) - srcClass = getattr(srcModule, 'Source_%s' % srcName) - if not issubclass(srcClass, loki_source.Source): - self.log("WARNING: invalid module for source '%s'\n" % srcName) - continue - self._sourceClasses[srcName] = srcClass - #if module class not loaded - srcSet.add(srcName) - #foreach source - return srcSet - #loadSourceModules() - - - def getSourceModuleVersions(self, sources=None): - srcSet = self.loadSourceModules(sources) - return { srcName : self._sourceClasses[srcName].getVersionString() for srcName in srcSet } - #getSourceModuleVersions() - - - def getSourceModuleOptions(self, sources=None): - srcSet = self.loadSourceModules(sources) - return { srcName : self._sourceClasses[srcName].getOptions() for srcName in srcSet } - #getSourceModuleOptions() - - - def attachSourceModules(self, sources=None): - sources = self.loadSourceModules(sources) - srcSet = set() - for srcName in sources: - if srcName not in self._sourceObjects: - if srcName not in self._sourceClasses: - raise Exception("loadSourceModules() reported false positive for '%s'" % srcName) - self._sourceObjects[srcName] = self._sourceClasses[srcName](self._loki) - #if module not instantiated - srcSet.add(srcName) - #foreach source - return srcSet - #attachSourceModules() - - def downloadAndHash(self, iwd, srcName, srcOptions): - srcObj = self._sourceObjects[srcName] - srcID = srcObj.getSourceID() - options = self._sourceOptions[srcName] - - try: - self.log("downloading %s data ...\n" % srcName) - # switch to a temp subdirectory for this source - path = os.path.join(iwd, srcName) - if not os.path.exists(path): - os.makedirs(path) - downloadedFiles = srcObj.download(options, path) - self.log("downloading %s data completed\n" % srcName) - - # calculate source file metadata - # all timestamps are assumed to be in UTC, but if a source - # provides file timestamps with 
no TZ (like via FTP) we use them - # as-is and assume they're supposed to be UTC - self.log("analyzing %s data files ...\n" % srcName) - for filename in downloadedFiles: - stat = os.stat(filename) - md5 = hashlib.md5() - with open(filename,'rb') as f: - chunk = f.read(8*1024*1024) - while chunk: - md5.update(chunk) - chunk = f.read(8*1024*1024) - self.lock.acquire() - self._filehash[filename] = (filename, int(stat.st_size), int(stat.st_mtime), md5.hexdigest()) - self.lock.release() - self.log("analyzing %s data files completed\n" % srcName) - except: - self.log("failed loading %s\n" % srcName) - # ToDo: determine how to handle failures - #downloadAndHash() - - - def updateDatabase(self, sources=None, sourceOptions=None, cacheOnly=False, forceUpdate=False): - if self._updating: - raise Exception("_updating set before updateDatabase()") - self._loki.testDatabaseWriteable() - if self._loki.getDatabaseSetting('finalized',int): - raise Exception("cannot update a finalized database") - - # check for extraneous options - self.logPush("preparing for update ...\n") - srcSet = self.attachSourceModules(sources) - srcOpts = sourceOptions or {} - for srcName in srcOpts.keys(): - if srcName not in srcSet: - self.log("WARNING: not updating from source '%s' for which options were supplied\n" % srcName) - logIndent = self.logPop("preparing for update completed\n") - - # update all specified sources - iwd = os.path.abspath(os.getcwd()) - self._updating = True - self._tablesUpdated = set() - self._tablesDeindexed = set() - srcErrors = set() - cursor = self._db.cursor() - cursor.execute("SAVEPOINT 'updateDatabase'") - try: - for srcName in sorted(srcSet): - srcObj = self._sourceObjects[srcName] - srcID = srcObj.getSourceID() - - # validate options, if any - prevOptions = dict() - for row in cursor.execute("SELECT option, value FROM `db`.`source_option` WHERE source_id = ?", (srcID,)): - prevOptions[str(row[0])] = str(row[1]) - options = srcOpts.get(srcName, prevOptions).copy() - optionsList = sorted(options) - if optionsList: - self.logPush("%s %s options ...\n" % (("validating" if (srcName in srcOpts) else "loading prior"), srcName)) - msg = srcObj.validateOptions(options) - if msg != True: - raise Exception(msg) - if optionsList: - for opt in optionsList: - self.log("%s = %s\n" % (opt,options[opt])) - self.logPop("... OK\n") - - #temp for now but should replace options everywhere below - self._sourceOptions[srcName] = options - - downloadAndHashThreads = {} - srcSetsToDownload = sorted(srcSet) - for srcName in srcSetsToDownload: - # download files into a local cache - if not cacheOnly: - downloadAndHashThreads[srcName] = Thread(target=self.downloadAndHash, args=(iwd, srcName, self._sourceOptions[srcName],)) - downloadAndHashThreads[srcName].start() - - for srcName in downloadAndHashThreads.keys(): - downloadAndHashThreads[srcName].join() - self.log(srcName + " rejoined main thread\n") - - for srcName in srcSetsToDownload: - srcObj = self._sourceObjects[srcName] - srcID = srcObj.getSourceID() - options = self._sourceOptions[srcName] - path = os.path.join(iwd, srcName) - - cursor.execute("SAVEPOINT 'updateDatabase_%s'" % (srcName,)) - - try: - # compare current loader version, options and file metadata to the last update - skip = not forceUpdate - last = '?' 
- if skip: - for row in cursor.execute("SELECT version, DATETIME(updated,'localtime') FROM `db`.`source` WHERE source_id = ?", (srcID,)): - skip = skip and (row[0] == srcObj.getVersionString()) - last = row[1] - if skip: - n = 0 - for row in cursor.execute("SELECT option, value FROM `db`.`source_option` WHERE source_id = ?", (srcID,)): - n += 1 - skip = skip and (row[0] in options) and (row[1] == options[row[0]]) - skip = skip and (n == len(options)) - if skip: - n = 0 - for row in cursor.execute("SELECT filename, size, md5 FROM `db`.`source_file` WHERE source_id = ?", (srcID,)): - n += 1 - skip = skip and (row[0] in self._filehash) and (row[1] == self._filehash[row[0]][1]) and (row[2] == self._filehash[row[0]][3]) - skip = skip and (n == len(self._filehash)) - - # skip the update if the current loader and all source file versions match the last update - if skip: - self.log("skipping %s update, no data or software changes since %s\n" % (srcName,last)) - else: - # process new files (or old files with a new loader) - self.logPush("processing %s data ...\n" % srcName) - - cursor.execute("DELETE FROM `db`.`warning` WHERE source_id = ?", (srcID,)) - srcObj.update(options, path) - cursor.execute("UPDATE `db`.`source` SET updated = DATETIME('now'), version = ? WHERE source_id = ?", (srcObj.getVersionString(), srcID)) - - cursor.execute("DELETE FROM `db`.`source_option` WHERE source_id = ?", (srcID,)) - sql = "INSERT INTO `db`.`source_option` (source_id, option, value) VALUES (%d,?,?)" % srcID - cursor.executemany(sql, options.items()) - - cursor.execute("DELETE FROM `db`.`source_file` WHERE source_id = ?", (srcID,)) - sql = "INSERT INTO `db`.`source_file` (source_id, filename, size, modified, md5) VALUES (%d,?,?,DATETIME(?,'unixepoch'),?)" % srcID - cursor.executemany(sql, self._filehash.values()) - - self.logPop("processing %s data completed\n" % srcName) - #if skip - except: - srcErrors.add(srcName) - excType,excVal,excTrace = sys.exc_info() - while self.logPop() > logIndent: - pass - self.logPush("ERROR: failed to update %s\n" % (srcName,)) - if excTrace: - for line in traceback.format_list(traceback.extract_tb(excTrace)[-1:]): - self.log(line) - for line in traceback.format_exception_only(excType,excVal): - self.log(line) - self.logPop() - cursor.execute("ROLLBACK TRANSACTION TO SAVEPOINT 'updateDatabase_%s'" % (srcName,)) - finally: - cursor.execute("RELEASE SAVEPOINT 'updateDatabase_%s'" % (srcName,)) - #try/except/finally - - # remove subdirectory to free up some space - shutil.rmtree(path) - #foreach source - - # pull the latest GRCh/UCSChg conversions - # http://genome.ucsc.edu/FAQ/FAQreleases.html - # http://genome.ucsc.edu/goldenPath/releaseLog.html - # TODO: find a better machine-readable source for this data - if not cacheOnly: - self.log("updating GRCh:UCSChg genome build identities ...\n") - import urllib.request as urllib2 - import re - response = urllib2.urlopen('http://genome.ucsc.edu/FAQ/FAQreleases.html') - page = "" - while True: - data = response.read() - if not data: - break - page += data.decode() - rowHuman = False - for tablerow in re.finditer(r'.*?', page, re.IGNORECASE | re.DOTALL): - cols = tuple(match.group()[4:-5].strip().lower() for match in re.finditer(r'.*?', tablerow.group(), re.IGNORECASE | re.DOTALL)) - if cols and ((cols[0] == 'human') or (rowHuman and (cols[0] in ('',' ')))): - rowHuman = True - grch = ucschg = None - try: - if cols[1].startswith('hg'): - ucschg = int(cols[1][2:]) - if cols[3].startswith('genome reference consortium grch'): - grch = 
int(cols[3][32:]) - if cols[3].startswith('ncbi build '): - grch = int(cols[3][11:]) - except: - pass - if grch and ucschg: - cursor.execute("INSERT OR REPLACE INTO `db`.`grch_ucschg` (grch,ucschg) VALUES (?,?)", (grch,ucschg)) - else: - rowHuman = False - #foreach tablerow - self.log("updating GRCh:UCSChg genome build identities completed\n") - #if not cacheOnly - - # cross-map GRCh/UCSChg build versions for all sources - ucscGRC = collections.defaultdict(int) - for row in self._db.cursor().execute("SELECT grch,ucschg FROM `db`.`grch_ucschg`"): - ucscGRC[row[1]] = max(row[0], ucscGRC[row[1]]) - cursor.execute("UPDATE `db`.`source` SET grch = ? WHERE grch IS NULL AND ucschg = ?", (row[0],row[1])) - cursor.execute("UPDATE `db`.`source` SET ucschg = ? WHERE ucschg IS NULL AND grch = ?", (row[1],row[0])) - cursor.execute("UPDATE `db`.`source` SET current_ucschg = ucschg WHERE current_ucschg IS NULL") - - # check for any source with an unrecognized GRCh build - mismatch = False - for row in cursor.execute("SELECT source, grch, ucschg FROM `db`.`source` WHERE (grch IS NULL) != (ucschg IS NULL)"): - self.log("WARNING: unrecognized genome build for '%s' (NCBI GRCh%s, UCSC hg%s)\n" % (row[0],(row[1] or "?"),(row[2] or "?"))) - mismatch = True - if mismatch: - self.log("WARNING: database may contain incomparable genome positions!\n") - - # check all sources' UCSChg build versions and set the latest as the target - hgSources = collections.defaultdict(set) - for row in cursor.execute("SELECT source_id, current_ucschg FROM `db`.`source` WHERE current_ucschg IS NOT NULL"): - hgSources[row[1]].add(row[0]) - if hgSources: - targetHG = max(hgSources) - self.log("database genome build: GRCh%s / UCSChg%s\n" % (ucscGRC.get(targetHG,'?'), targetHG)) - targetUpdated = (self._loki.getDatabaseSetting('ucschg',int) != targetHG) - self._loki.setDatabaseSetting('ucschg', targetHG) - - # liftOver sources with old build versions, if there are any - if len(hgSources) > 1: - locusSources = set(row[0] for row in cursor.execute("SELECT DISTINCT source_id FROM `db`.`snp_locus`")) - regionSources = set(row[0] for row in cursor.execute("SELECT DISTINCT source_id FROM `db`.`biopolymer_region`")) - chainsUpdated = ('grch_ucschg' in self._tablesUpdated or 'chain' in self._tablesUpdated or 'chain_data' in self._tablesUpdated) - for oldHG in sorted(hgSources): - if oldHG == targetHG: - continue - if not self._loki.hasLiftOverChains(oldHG, targetHG): - self.log("ERROR: no chains available to lift hg%d to hg%d\n" % (oldHG, targetHG)) - continue - - if targetUpdated or chainsUpdated or 'snp_locus' in self._tablesUpdated: - sourceIDs = hgSources[oldHG] & locusSources - if sourceIDs: - self.liftOverSNPLoci(oldHG, targetHG, sourceIDs) - if targetUpdated or chainsUpdated or 'biopolymer_region' in self._tablesUpdated: - sourceIDs = hgSources[oldHG] & regionSources - if sourceIDs: - self.liftOverRegions(oldHG, targetHG, sourceIDs) - - sql = "UPDATE `db`.`source` SET current_ucschg = %d WHERE source_id = ?" 
% targetHG - cursor.executemany(sql, ((sourceID,) for sourceID in hgSources[oldHG])) - #foreach old build - #if any old builds - - # post-process as needed - #self.log("MEMORY: %d bytes (%d peak)\n" % self._loki.getDatabaseMemoryUsage()) #DEBUG - if 'snp_merge' in self._tablesUpdated: - self.cleanupSNPMerges() - #self.log("MEMORY: %d bytes (%d peak)\n" % self._loki.getDatabaseMemoryUsage()) #DEBUG - if 'snp_merge' in self._tablesUpdated or 'snp_locus' in self._tablesUpdated: - self.updateMergedSNPLoci() - #self.log("MEMORY: %d bytes (%d peak)\n" % self._loki.getDatabaseMemoryUsage()) #DEBUG - if 'snp_locus' in self._tablesUpdated: - self.cleanupSNPLoci() - #self.log("MEMORY: %d bytes (%d peak)\n" % self._loki.getDatabaseMemoryUsage()) #DEBUG - if 'snp_merge' in self._tablesUpdated or 'snp_entrez_role' in self._tablesUpdated: - self.updateMergedSNPEntrezRoles() - #self.log("MEMORY: %d bytes (%d peak)\n" % self._loki.getDatabaseMemoryUsage()) #DEBUG - if 'snp_entrez_role' in self._tablesUpdated: - self.cleanupSNPEntrezRoles() - #self.log("MEMORY: %d bytes (%d peak)\n" % self._loki.getDatabaseMemoryUsage()) #DEBUG - if 'snp_merge' in self._tablesUpdated or 'gwas' in self._tablesUpdated: - self.updateMergedGWASAnnotations() - #self.log("MEMORY: %d bytes (%d peak)\n" % self._loki.getDatabaseMemoryUsage()) #DEBUG - if 'biopolymer_name' in self._tablesUpdated or 'biopolymer_name_name' in self._tablesUpdated: - self.resolveBiopolymerNames() - #self.log("MEMORY: %d bytes (%d peak)\n" % self._loki.getDatabaseMemoryUsage()) #DEBUG - if 'biopolymer_name' in self._tablesUpdated or 'snp_entrez_role' in self._tablesUpdated: - self.resolveSNPBiopolymerRoles() - #self.log("MEMORY: %d bytes (%d peak)\n" % self._loki.getDatabaseMemoryUsage()) #DEBUG - if 'biopolymer_name' in self._tablesUpdated or 'group_member_name' in self._tablesUpdated: - self.resolveGroupMembers() - #self.log("MEMORY: %d bytes (%d peak)\n" % self._loki.getDatabaseMemoryUsage()) #DEBUG - if 'biopolymer_region' in self._tablesUpdated: - self.updateBiopolymerZones() - #self.log("MEMORY: %d bytes (%d peak)\n" % self._loki.getDatabaseMemoryUsage()) #DEBUG - - # reindex all remaining tables - if self._tablesDeindexed: - self._loki.createDatabaseIndices(None, 'db', self._tablesDeindexed) - if self._tablesUpdated: - self._loki.setDatabaseSetting('optimized',0) - self.log("updating database completed\n") - except: - excType,excVal,excTrace = sys.exc_info() - while self.logPop() > logIndent: - pass - self.logPush("ERROR: failed to update the database\n") - if excTrace: - for line in traceback.format_list(traceback.extract_tb(excTrace)[-1:]): - self.log(line) - for line in traceback.format_exception_only(excType,excVal): - self.log(line) - self.logPop() - cursor.execute("ROLLBACK TRANSACTION TO SAVEPOINT 'updateDatabase'") - finally: - cursor.execute("RELEASE SAVEPOINT 'updateDatabase'") - self._updating = False - self._tablesUpdated = set() - self._tablesDeindexed = set() - os.chdir(iwd) - #try/except/finally - - # report and return - if srcErrors: - self.logPush("WARNING: data from these sources was not updated:\n") - for srcName in sorted(srcErrors): - self.log("%s\n" % srcName) - self.logPop() - return False - return True - #updateDatabase() - - - def liftOverSNPLoci(self, oldHG, newHG, sourceIDs): - self.log("lifting over SNP loci from hg%d to hg%d ..." 
% (oldHG,newHG)) - self.prepareTableForUpdate('snp_locus') - cursor = self._db.cursor() - numLift = numNull = 0 - tally = dict() - trash = set() - - # identify range of _ROWID_ in snp_locus - # (two separate queries is faster because a simple MIN() or MAX() only peeks at the index; - # SQLite isn't clever enough to do that for both at the same time, it does a table scan instead) - firstRowID = min(row[0] for row in cursor.execute("SELECT MIN(_ROWID_) FROM `db`.`snp_locus`")) - lastRowID = max(row[0] for row in cursor.execute("SELECT MAX(_ROWID_) FROM `db`.`snp_locus`")) - - # define a callback to store loci that can't be lifted over, for later deletion - def errorCallback(region): - trash.add( (region[0],) ) - - # we can't SELECT and UPDATE the same table at the same time, - # so read in batches of 2.5 million at a time based on _ROWID_ - minRowID = firstRowID - maxRowID = minRowID + 2500000 - 1 - while minRowID <= lastRowID: - sql = "SELECT _ROWID_, chr, pos, NULL FROM `db`.`snp_locus`" - sql += " WHERE (_ROWID_ BETWEEN ? AND ?) AND source_id IN (%s)" % (','.join(str(i) for i in sourceIDs)) - oldLoci = list(cursor.execute(sql, (minRowID,maxRowID))) - newLoci = self._loki.generateLiftOverLoci(oldHG, newHG, oldLoci, tally, errorCallback) - sql = "UPDATE OR REPLACE `db`.`snp_locus` SET chr = ?2, pos = ?3 WHERE _ROWID_ = ?1" - cursor.executemany(sql, newLoci) - numLift += tally['lift'] - numNull += tally['null'] - if trash: - cursor.executemany("DELETE FROM `db`.`snp_locus` WHERE _ROWID_ = ?", trash) - trash.clear() - minRowID = maxRowID + 1 - maxRowID = minRowID + 2500000 - 1 - #foreach batch - - self.log(" OK: %d loci lifted over, %d dropped\n" % (numLift,numNull)) - #liftOverSNPLoci() - - - def liftOverRegions(self, oldHG, newHG, sourceIDs): - self.log("lifting over regions from hg%d to hg%d ..." % (oldHG,newHG)) - self.prepareTableForUpdate('biopolymer_region') - cursor = self._db.cursor() - numLift = numNull = 0 - tally = dict() - trash = set() - - # identify range of _ROWID_ in biopolymer_region - # (two separate queries is faster because a simple MIN() or MAX() only peeks at the index; - # SQLite isn't clever enough to do that for both at the same time, it does a table scan instead) - firstRowID = min(row[0] for row in cursor.execute("SELECT MIN(_ROWID_) FROM `db`.`biopolymer_region`")) - lastRowID = max(row[0] for row in cursor.execute("SELECT MAX(_ROWID_) FROM `db`.`biopolymer_region`")) - - # define a callback to store regions that can't be lifted over, for later deletion - def errorCallback(region): - trash.add( (region[0],) ) - - # we can't SELECT and UPDATE the same table at the same time, - # so read in batches of 2.5 million at a time based on _ROWID_ - # (for regions this will probably be all of them in one go, but just in case) - minRowID = firstRowID - maxRowID = minRowID + 2500000 - 1 - while minRowID <= lastRowID: - sql = "SELECT _ROWID_, chr, posMin, posMax, NULL FROM `db`.`biopolymer_region`" - sql += " WHERE (_ROWID_ BETWEEN ? AND ?) 
AND source_id IN (%s)" % (','.join(str(i) for i in sourceIDs)) - oldRegions = list(cursor.execute(sql, (minRowID,maxRowID))) - newRegions = self._loki.generateLiftOverRegions(oldHG, newHG, oldRegions, tally, errorCallback) - sql = "UPDATE OR REPLACE `db`.`biopolymer_region` SET chr = ?2, posMin = ?3, posMax = ?4 WHERE _ROWID_ = ?1 AND (1 OR ?5)" - cursor.executemany(sql, newRegions) - numLift += tally['lift'] - numNull += tally['null'] - if trash: - cursor.executemany("DELETE FROM `db`.`biopolymer_region` WHERE _ROWID_ = ?", trash) - trash.clear() - minRowID = maxRowID + 1 - maxRowID = minRowID + 2500000 - 1 - #foreach batch - - self.log(" OK: %d regions lifted over, %d dropped\n" % (numLift,numNull)) - #liftOverRegions() - - - def cleanupSNPMerges(self): - self.log("verifying SNP merge records ...") - self.prepareTableForQuery('snp_merge') - dbc = self._db.cursor() - - # for each set of ROWIDs which constitute a duplicated snp merge, cull all but one - cull = set() - sql = "SELECT GROUP_CONCAT(_ROWID_) FROM `db`.`snp_merge` GROUP BY rsMerged HAVING COUNT() > 1" - #for row in dbc.execute("EXPLAIN QUERY PLAN "+sql): #DEBUG - # print row - for row in dbc.execute(sql): - cull.update( (int(i),) for i in row[0].split(',')[1:] ) - #last = None - #for row in dbc.execute("SELECT _ROWID_, rsMerged FROM `db`.`snp_merge` ORDER BY rsMerged"): - # if last == row[1]: - # cull.add(row[0:1]) - # last = row[1] - if cull: - self.flagTableUpdate('snp_merge') - dbc.executemany("DELETE FROM `db`.`snp_merge` WHERE _ROWID_ = ?", cull) - self.log(" OK: %d duplicate merges\n" % (len(cull),)) - #cleanupSNPMerges() - - - def updateMergedSNPLoci(self): - self.log("checking for merged SNP loci ...") - self.prepareTableForQuery('snp_locus') - self.prepareTableForQuery('snp_merge') - dbc = self._db.cursor() - sql = """ + + ################################################## + # constructor + + def __init__(self, lokidb, is_test=False): + assert isinstance(lokidb, loki_db.Database) + self._is_test = is_test + self._loki = lokidb + self._db = lokidb._db + self._sourceLoaders = {} + self._sourceClasses = dict() + self._sourceObjects = dict() + self._sourceOptions = dict() + self._filehash = dict() + self._updating = False + self._tablesUpdated = set() + self._tablesDeindexed = set() + self.lock = Lock() + + # __init__() + + ################################################## + # logging + + def log(self, message=""): + return self._loki.log(message) + + # log() + + def logPush(self, message=None): + return self._loki.logPush(message) + + # logPush() + + def logPop(self, message=None): + return self._loki.logPop(message) + + # logPop() + + ################################################## + # database update + + def flagTableUpdate(self, table): + self._tablesUpdated.add(table) + + # flagTableUpdate() + + def prepareTableForUpdate(self, table): + if self._updating: + self.flagTableUpdate(table) + if table not in self._tablesDeindexed: + # print "deindexing %s" % table #DEBUG + self._tablesDeindexed.add(table) + self._loki.dropDatabaseIndices(None, "db", table) + + # prepareTableForUpdate() + + def prepareTableForQuery(self, table): + if self._updating: + if table in self._tablesDeindexed: + # print "reindexing %s" % table DEBUG + self._tablesDeindexed.remove(table) + self._loki.createDatabaseIndices(None, "db", table) + + # prepareTableForQuery() + + def findSourceModules(self): + if not self._sourceLoaders: + self._sourceLoaders = {} + loader_path = loaders.__path__ + if self._is_test: + loader_path = [ + 
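# prepareTableForUpdate()/prepareTableForQuery() above are, in effect, the
# classic bulk-load pattern: drop a table's indexes before mass inserts and
# recreate them afterwards, so SQLite builds each index once instead of
# maintaining it row by row.  A stripped-down illustration of that pattern
# (stdlib sqlite3 and a toy table, not LOKI's own Database wrapper):
import sqlite3

db = sqlite3.connect(":memory:")
db.execute("CREATE TABLE snp_locus (rs INTEGER, chr INTEGER, pos INTEGER)")
db.execute("CREATE INDEX snp_locus__rs ON snp_locus (rs)")

db.execute("DROP INDEX snp_locus__rs")                      # prepareTableForUpdate()
db.executemany(
    "INSERT INTO snp_locus VALUES (?,?,?)",
    ((rs, 1, rs * 100) for rs in range(10000)),             # the bulk update
)
db.execute("CREATE INDEX snp_locus__rs ON snp_locus (rs)")  # prepareTableForQuery()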
os.path.join(loader, "test") for loader in loaders.__path__ + ] + for path in loader_path: + for srcModuleName in os.listdir(path): + if srcModuleName.startswith("loki_source_"): + self._sourceLoaders[srcModuleName[12:-3]] = 1 + + # findSourceModules() + + def getSourceModules(self): + self.findSourceModules() + return self._sourceLoaders.keys() + + # getSourceModules() + + def loadSourceModules(self, sources=None): + self.findSourceModules() + srcSet = set() + for srcName in set(sources) if sources else self._sourceLoaders.keys(): + if srcName not in self._sourceClasses: + if srcName not in self._sourceLoaders: + self.log("WARNING: unknown source '%s'\n" % srcName) + continue + # if module not available + srcModule = importlib.import_module( + "%s.loki_source_%s" % (loaders.__name__, srcName) + ) + srcClass = getattr(srcModule, "Source_%s" % srcName) + if not issubclass(srcClass, loki_source.Source): + self.log("WARNING: invalid module for source '%s'\n" % srcName) + continue + self._sourceClasses[srcName] = srcClass + # if module class not loaded + srcSet.add(srcName) + # foreach source + return srcSet + + # loadSourceModules() + + def getSourceModuleVersions(self, sources=None): + srcSet = self.loadSourceModules(sources) + return { + srcName: self._sourceClasses[srcName].getVersionString() + for srcName in srcSet + } + + # getSourceModuleVersions() + + def getSourceModuleOptions(self, sources=None): + srcSet = self.loadSourceModules(sources) + return { + srcName: self._sourceClasses[srcName].getOptions() for srcName in srcSet + } + + # getSourceModuleOptions() + + def attachSourceModules(self, sources=None): + sources = self.loadSourceModules(sources) + srcSet = set() + for srcName in sources: + if srcName not in self._sourceObjects: + if srcName not in self._sourceClasses: + raise Exception( + "loadSourceModules() reported false positive for '%s'" % srcName + ) + self._sourceObjects[srcName] = self._sourceClasses[srcName](self._loki) + # if module not instantiated + srcSet.add(srcName) + # foreach source + return srcSet + + # attachSourceModules() + + def downloadAndHash(self, iwd, srcName, srcOptions): + srcObj = self._sourceObjects[srcName] + srcID = srcObj.getSourceID() + options = self._sourceOptions[srcName] + + try: + self.log("downloading %s data ...\n" % srcName) + # switch to a temp subdirectory for this source + path = os.path.join(iwd, srcName) + if not os.path.exists(path): + os.makedirs(path) + downloadedFiles = srcObj.download(options, path) + self.log("downloading %s data completed\n" % srcName) + + # calculate source file metadata + # all timestamps are assumed to be in UTC, but if a source + # provides file timestamps with no TZ (like via FTP) we use them + # as-is and assume they're supposed to be UTC + self.log("analyzing %s data files ...\n" % srcName) + for filename in downloadedFiles: + stat = os.stat(filename) + md5 = hashlib.md5() + with open(filename, "rb") as f: + chunk = f.read(8 * 1024 * 1024) + while chunk: + md5.update(chunk) + chunk = f.read(8 * 1024 * 1024) + self.lock.acquire() + self._filehash[filename] = ( + filename, + int(stat.st_size), + int(stat.st_mtime), + md5.hexdigest(), + ) + self.lock.release() + self.log("analyzing %s data files completed\n" % srcName) + except: + self.log("failed loading %s\n" % srcName) + # ToDo: determine how to handle failures + + # downloadAndHash() + + def updateDatabase( + self, sources=None, sourceOptions=None, cacheOnly=False, forceUpdate=False + ): + if self._updating: + raise Exception("_updating set before 
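# downloadAndHash() above fingerprints each downloaded file by streaming it
# through MD5 in fixed-size chunks, so multi-gigabyte source files never have
# to fit in memory, and records (size, mtime, md5) for the later
# "has anything changed?" comparison.  A minimal standalone sketch of that
# step (the file name in the usage note is hypothetical):
import hashlib
import os


def file_fingerprint(path, chunk_size=8 * 1024 * 1024):
    """Return (path, size, mtime, md5hex) for one downloaded file."""
    stat = os.stat(path)
    md5 = hashlib.md5()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            md5.update(chunk)
    return (path, int(stat.st_size), int(stat.st_mtime), md5.hexdigest())


# usage: file_fingerprint("dbsnp/00-All.vcf.gz")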
updateDatabase()") + self._loki.testDatabaseWriteable() + if self._loki.getDatabaseSetting("finalized", int): + raise Exception("cannot update a finalized database") + + # check for extraneous options + self.logPush("preparing for update ...\n") + srcSet = self.attachSourceModules(sources) + srcOpts = sourceOptions or {} + for srcName in srcOpts.keys(): + if srcName not in srcSet: + self.log( + "WARNING: not updating from source '%s' for which options were supplied\n" + % srcName + ) + logIndent = self.logPop("preparing for update completed\n") + + # update all specified sources + iwd = os.path.abspath(os.getcwd()) + self._updating = True + self._tablesUpdated = set() + self._tablesDeindexed = set() + srcErrors = set() + cursor = self._db.cursor() + cursor.execute("SAVEPOINT 'updateDatabase'") + try: + for srcName in sorted(srcSet): + srcObj = self._sourceObjects[srcName] + srcID = srcObj.getSourceID() + + # validate options, if any + prevOptions = dict() + for row in cursor.execute( + "SELECT option, value FROM `db`.`source_option` WHERE source_id = ?", + (srcID,), + ): + prevOptions[str(row[0])] = str(row[1]) + options = srcOpts.get(srcName, prevOptions).copy() + optionsList = sorted(options) + if optionsList: + self.logPush( + "%s %s options ...\n" + % ( + ("validating" if (srcName in srcOpts) else "loading prior"), + srcName, + ) + ) + msg = srcObj.validateOptions(options) + if msg != True: + raise Exception(msg) + if optionsList: + for opt in optionsList: + self.log("%s = %s\n" % (opt, options[opt])) + self.logPop("... OK\n") + + # temp for now but should replace options everywhere below + self._sourceOptions[srcName] = options + + downloadAndHashThreads = {} + srcSetsToDownload = sorted(srcSet) + for srcName in srcSetsToDownload: + # download files into a local cache + if not cacheOnly: + downloadAndHashThreads[srcName] = Thread( + target=self.downloadAndHash, + args=( + iwd, + srcName, + self._sourceOptions[srcName], + ), + ) + downloadAndHashThreads[srcName].start() + + for srcName in downloadAndHashThreads.keys(): + downloadAndHashThreads[srcName].join() + self.log(srcName + " rejoined main thread\n") + + for srcName in srcSetsToDownload: + srcObj = self._sourceObjects[srcName] + srcID = srcObj.getSourceID() + options = self._sourceOptions[srcName] + path = os.path.join(iwd, srcName) + + cursor.execute("SAVEPOINT 'updateDatabase_%s'" % (srcName,)) + + try: + # compare current loader version, options and file metadata to the last update + skip = not forceUpdate + last = "?" 
+ if skip: + for row in cursor.execute( + "SELECT version, DATETIME(updated,'localtime') FROM `db`.`source` WHERE source_id = ?", + (srcID,), + ): + skip = skip and (row[0] == srcObj.getVersionString()) + last = row[1] + if skip: + n = 0 + for row in cursor.execute( + "SELECT option, value FROM `db`.`source_option` WHERE source_id = ?", + (srcID,), + ): + n += 1 + skip = ( + skip + and (row[0] in options) + and (row[1] == options[row[0]]) + ) + skip = skip and (n == len(options)) + if skip: + n = 0 + for row in cursor.execute( + "SELECT filename, size, md5 FROM `db`.`source_file` WHERE source_id = ?", + (srcID,), + ): + n += 1 + skip = ( + skip + and (row[0] in self._filehash) + and (row[1] == self._filehash[row[0]][1]) + and (row[2] == self._filehash[row[0]][3]) + ) + skip = skip and (n == len(self._filehash)) + + # skip the update if the current loader and all source file versions match the last update + if skip: + self.log( + "skipping %s update, no data or software changes since %s\n" + % (srcName, last) + ) + else: + # process new files (or old files with a new loader) + self.logPush("processing %s data ...\n" % srcName) + + cursor.execute( + "DELETE FROM `db`.`warning` WHERE source_id = ?", (srcID,) + ) + srcObj.update(options, path) + cursor.execute( + "UPDATE `db`.`source` SET updated = DATETIME('now'), version = ? WHERE source_id = ?", + (srcObj.getVersionString(), srcID), + ) + + cursor.execute( + "DELETE FROM `db`.`source_option` WHERE source_id = ?", + (srcID,), + ) + sql = ( + "INSERT INTO `db`.`source_option` (source_id, option, value) VALUES (%d,?,?)" + % srcID + ) + cursor.executemany(sql, options.items()) + + cursor.execute( + "DELETE FROM `db`.`source_file` WHERE source_id = ?", + (srcID,), + ) + sql = ( + "INSERT INTO `db`.`source_file` (source_id, filename, size, modified, md5) VALUES (%d,?,?,DATETIME(?,'unixepoch'),?)" + % srcID + ) + cursor.executemany(sql, self._filehash.values()) + + self.logPop("processing %s data completed\n" % srcName) + # if skip + except: + srcErrors.add(srcName) + excType, excVal, excTrace = sys.exc_info() + while self.logPop() > logIndent: + pass + self.logPush("ERROR: failed to update %s\n" % (srcName,)) + if excTrace: + for line in traceback.format_list( + traceback.extract_tb(excTrace)[-1:] + ): + self.log(line) + for line in traceback.format_exception_only(excType, excVal): + self.log(line) + self.logPop() + cursor.execute( + "ROLLBACK TRANSACTION TO SAVEPOINT 'updateDatabase_%s'" + % (srcName,) + ) + finally: + cursor.execute("RELEASE SAVEPOINT 'updateDatabase_%s'" % (srcName,)) + # try/except/finally + + # remove subdirectory to free up some space + shutil.rmtree(path) + # foreach source + + # pull the latest GRCh/UCSChg conversions + # http://genome.ucsc.edu/FAQ/FAQreleases.html + # http://genome.ucsc.edu/goldenPath/releaseLog.html + # TODO: find a better machine-readable source for this data + if not cacheOnly: + self.log("updating GRCh:UCSChg genome build identities ...\n") + import urllib.request as urllib2 + import re + + response = urllib2.urlopen( + "http://genome.ucsc.edu/FAQ/FAQreleases.html" + ) + page = "" + while True: + data = response.read() + if not data: + break + page += data.decode() + rowHuman = False + for tablerow in re.finditer( + r".*?", page, re.IGNORECASE | re.DOTALL + ): + cols = tuple( + match.group()[4:-5].strip().lower() + for match in re.finditer( + r".*?", tablerow.group(), re.IGNORECASE | re.DOTALL + ) + ) + if cols and ( + (cols[0] == "human") + or (rowHuman and (cols[0] in ("", " "))) + ): + rowHuman = 
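# The skip test above re-processes a source only when its loader version, its
# stored options, or any downloaded file's size/MD5 differ from what was
# recorded at the last update.  A condensed sketch of the same comparison,
# with plain dicts standing in for the `source_option` / `source_file` tables:
def should_skip(prev_version, new_version, prev_files, new_files):
    """prev_files/new_files map filename -> (size, md5hex)."""
    if prev_version != new_version:
        return False
    if set(prev_files) != set(new_files):
        return False
    return all(prev_files[name] == new_files[name] for name in new_files)


# should_skip("2.0", "2.0", {"a.gz": (10, "ab")}, {"a.gz": (10, "ab")})  -> True
# should_skip("2.0", "2.1", {"a.gz": (10, "ab")}, {"a.gz": (10, "ab")})  -> False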
True + grch = ucschg = None + try: + if cols[1].startswith("hg"): + ucschg = int(cols[1][2:]) + if cols[3].startswith("genome reference consortium grch"): + grch = int(cols[3][32:]) + if cols[3].startswith("ncbi build "): + grch = int(cols[3][11:]) + except: + pass + if grch and ucschg: + cursor.execute( + "INSERT OR REPLACE INTO `db`.`grch_ucschg` (grch,ucschg) VALUES (?,?)", + (grch, ucschg), + ) + else: + rowHuman = False + # foreach tablerow + self.log("updating GRCh:UCSChg genome build identities completed\n") + # if not cacheOnly + + # cross-map GRCh/UCSChg build versions for all sources + ucscGRC = collections.defaultdict(int) + for row in self._db.cursor().execute( + "SELECT grch,ucschg FROM `db`.`grch_ucschg`" + ): + ucscGRC[row[1]] = max(row[0], ucscGRC[row[1]]) + cursor.execute( + "UPDATE `db`.`source` SET grch = ? WHERE grch IS NULL AND ucschg = ?", + (row[0], row[1]), + ) + cursor.execute( + "UPDATE `db`.`source` SET ucschg = ? WHERE ucschg IS NULL AND grch = ?", + (row[1], row[0]), + ) + cursor.execute( + "UPDATE `db`.`source` SET current_ucschg = ucschg WHERE current_ucschg IS NULL" + ) + + # check for any source with an unrecognized GRCh build + mismatch = False + for row in cursor.execute( + "SELECT source, grch, ucschg FROM `db`.`source` WHERE (grch IS NULL) != (ucschg IS NULL)" + ): + self.log( + "WARNING: unrecognized genome build for '%s' (NCBI GRCh%s, UCSC hg%s)\n" + % (row[0], (row[1] or "?"), (row[2] or "?")) + ) + mismatch = True + if mismatch: + self.log( + "WARNING: database may contain incomparable genome positions!\n" + ) + + # check all sources' UCSChg build versions and set the latest as the target + hgSources = collections.defaultdict(set) + for row in cursor.execute( + "SELECT source_id, current_ucschg FROM `db`.`source` WHERE current_ucschg IS NOT NULL" + ): + hgSources[row[1]].add(row[0]) + if hgSources: + targetHG = max(hgSources) + self.log( + "database genome build: GRCh%s / UCSChg%s\n" + % (ucscGRC.get(targetHG, "?"), targetHG) + ) + targetUpdated = self._loki.getDatabaseSetting("ucschg", int) != targetHG + self._loki.setDatabaseSetting("ucschg", targetHG) + + # liftOver sources with old build versions, if there are any + if len(hgSources) > 1: + locusSources = set( + row[0] + for row in cursor.execute( + "SELECT DISTINCT source_id FROM `db`.`snp_locus`" + ) + ) + regionSources = set( + row[0] + for row in cursor.execute( + "SELECT DISTINCT source_id FROM `db`.`biopolymer_region`" + ) + ) + chainsUpdated = ( + "grch_ucschg" in self._tablesUpdated + or "chain" in self._tablesUpdated + or "chain_data" in self._tablesUpdated + ) + for oldHG in sorted(hgSources): + if oldHG == targetHG: + continue + if not self._loki.hasLiftOverChains(oldHG, targetHG): + self.log( + "ERROR: no chains available to lift hg%d to hg%d\n" + % (oldHG, targetHG) + ) + continue + + if ( + targetUpdated + or chainsUpdated + or "snp_locus" in self._tablesUpdated + ): + sourceIDs = hgSources[oldHG] & locusSources + if sourceIDs: + self.liftOverSNPLoci(oldHG, targetHG, sourceIDs) + if ( + targetUpdated + or chainsUpdated + or "biopolymer_region" in self._tablesUpdated + ): + sourceIDs = hgSources[oldHG] & regionSources + if sourceIDs: + self.liftOverRegions(oldHG, targetHG, sourceIDs) + + sql = ( + "UPDATE `db`.`source` SET current_ucschg = %d WHERE source_id = ?" 
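# The UCSC-page scrape above reduces to pulling an integer out of two label
# formats: "hgNN" for the UCSC build and either "Genome Reference Consortium
# GRChNN" or "NCBI Build NN" for the NCBI build.  A small sketch of just that
# parsing, applied to made-up table cells:
def parse_build(ucsc_cell, ncbi_cell):
    ucsc_cell = ucsc_cell.strip().lower()
    ncbi_cell = ncbi_cell.strip().lower()
    grch = ucschg = None
    if ucsc_cell.startswith("hg"):
        ucschg = int(ucsc_cell[2:])
    if ncbi_cell.startswith("genome reference consortium grch"):
        grch = int(ncbi_cell[32:])
    elif ncbi_cell.startswith("ncbi build "):
        grch = int(ncbi_cell[11:])
    return grch, ucschg


# parse_build("hg19", "Genome Reference Consortium GRCh37")  -> (37, 19)
# parse_build("hg18", "NCBI Build 36")                       -> (36, 18)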
+ % targetHG + ) + cursor.executemany( + sql, ((sourceID,) for sourceID in hgSources[oldHG]) + ) + # foreach old build + # if any old builds + + # post-process as needed + # self.log("MEMORY: %d bytes (%d peak)\n" % self._loki.getDatabaseMemoryUsage()) #DEBUG + if "snp_merge" in self._tablesUpdated: + self.cleanupSNPMerges() + # self.log("MEMORY: %d bytes (%d peak)\n" % self._loki.getDatabaseMemoryUsage()) #DEBUG + if "snp_merge" in self._tablesUpdated or "snp_locus" in self._tablesUpdated: + self.updateMergedSNPLoci() + # self.log("MEMORY: %d bytes (%d peak)\n" % self._loki.getDatabaseMemoryUsage()) #DEBUG + if "snp_locus" in self._tablesUpdated: + self.cleanupSNPLoci() + # self.log("MEMORY: %d bytes (%d peak)\n" % self._loki.getDatabaseMemoryUsage()) #DEBUG + if ( + "snp_merge" in self._tablesUpdated + or "snp_entrez_role" in self._tablesUpdated + ): + self.updateMergedSNPEntrezRoles() + # self.log("MEMORY: %d bytes (%d peak)\n" % self._loki.getDatabaseMemoryUsage()) #DEBUG + if "snp_entrez_role" in self._tablesUpdated: + self.cleanupSNPEntrezRoles() + # self.log("MEMORY: %d bytes (%d peak)\n" % self._loki.getDatabaseMemoryUsage()) #DEBUG + if "snp_merge" in self._tablesUpdated or "gwas" in self._tablesUpdated: + self.updateMergedGWASAnnotations() + # self.log("MEMORY: %d bytes (%d peak)\n" % self._loki.getDatabaseMemoryUsage()) #DEBUG + if ( + "biopolymer_name" in self._tablesUpdated + or "biopolymer_name_name" in self._tablesUpdated + ): + self.resolveBiopolymerNames() + # self.log("MEMORY: %d bytes (%d peak)\n" % self._loki.getDatabaseMemoryUsage()) #DEBUG + if ( + "biopolymer_name" in self._tablesUpdated + or "snp_entrez_role" in self._tablesUpdated + ): + self.resolveSNPBiopolymerRoles() + # self.log("MEMORY: %d bytes (%d peak)\n" % self._loki.getDatabaseMemoryUsage()) #DEBUG + if ( + "biopolymer_name" in self._tablesUpdated + or "group_member_name" in self._tablesUpdated + ): + self.resolveGroupMembers() + # self.log("MEMORY: %d bytes (%d peak)\n" % self._loki.getDatabaseMemoryUsage()) #DEBUG + if "biopolymer_region" in self._tablesUpdated: + self.updateBiopolymerZones() + # self.log("MEMORY: %d bytes (%d peak)\n" % self._loki.getDatabaseMemoryUsage()) #DEBUG + + # reindex all remaining tables + if self._tablesDeindexed: + self._loki.createDatabaseIndices(None, "db", self._tablesDeindexed) + if self._tablesUpdated: + self._loki.setDatabaseSetting("optimized", 0) + self.log("updating database completed\n") + except: + excType, excVal, excTrace = sys.exc_info() + while self.logPop() > logIndent: + pass + self.logPush("ERROR: failed to update the database\n") + if excTrace: + for line in traceback.format_list(traceback.extract_tb(excTrace)[-1:]): + self.log(line) + for line in traceback.format_exception_only(excType, excVal): + self.log(line) + self.logPop() + cursor.execute("ROLLBACK TRANSACTION TO SAVEPOINT 'updateDatabase'") + finally: + cursor.execute("RELEASE SAVEPOINT 'updateDatabase'") + self._updating = False + self._tablesUpdated = set() + self._tablesDeindexed = set() + os.chdir(iwd) + # try/except/finally + + # report and return + if srcErrors: + self.logPush("WARNING: data from these sources was not updated:\n") + for srcName in sorted(srcErrors): + self.log("%s\n" % srcName) + self.logPop() + return False + return True + + # updateDatabase() + + def liftOverSNPLoci(self, oldHG, newHG, sourceIDs): + self.log("lifting over SNP loci from hg%d to hg%d ..." 
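# The error handling above leans on SQLite savepoints: open a named savepoint,
# ROLLBACK TO it if anything inside fails, and RELEASE it in a finally block
# either way, so a failed update never leaves partial rows behind.  A compact
# sketch of that shape (stdlib sqlite3 with autocommit so the savepoint
# statements pass through untouched; toy table only):
import sqlite3

db = sqlite3.connect(":memory:", isolation_level=None)
db.execute("CREATE TABLE source (name TEXT)")
cursor = db.cursor()
cursor.execute("SAVEPOINT 'updateDatabase'")
try:
    cursor.execute("INSERT INTO source VALUES ('dbsnp')")
    raise RuntimeError("simulated loader failure")
except Exception:
    cursor.execute("ROLLBACK TRANSACTION TO SAVEPOINT 'updateDatabase'")
finally:
    cursor.execute("RELEASE SAVEPOINT 'updateDatabase'")
print(cursor.execute("SELECT COUNT(*) FROM source").fetchone()[0])  # prints 0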
% (oldHG, newHG)) + self.prepareTableForUpdate("snp_locus") + cursor = self._db.cursor() + numLift = numNull = 0 + tally = dict() + trash = set() + + # identify range of _ROWID_ in snp_locus + # (two separate queries is faster because a simple MIN() or MAX() only peeks at the index; + # SQLite isn't clever enough to do that for both at the same time, it does a table scan instead) + firstRowID = min( + row[0] + for row in cursor.execute("SELECT MIN(_ROWID_) FROM `db`.`snp_locus`") + ) + lastRowID = max( + row[0] + for row in cursor.execute("SELECT MAX(_ROWID_) FROM `db`.`snp_locus`") + ) + + # define a callback to store loci that can't be lifted over, for later deletion + def errorCallback(region): + trash.add((region[0],)) + + # we can't SELECT and UPDATE the same table at the same time, + # so read in batches of 2.5 million at a time based on _ROWID_ + minRowID = firstRowID + maxRowID = minRowID + 2500000 - 1 + while minRowID <= lastRowID: + sql = "SELECT _ROWID_, chr, pos, NULL FROM `db`.`snp_locus`" + sql += " WHERE (_ROWID_ BETWEEN ? AND ?) AND source_id IN (%s)" % ( + ",".join(str(i) for i in sourceIDs) + ) + oldLoci = list(cursor.execute(sql, (minRowID, maxRowID))) + newLoci = self._loki.generateLiftOverLoci( + oldHG, newHG, oldLoci, tally, errorCallback + ) + sql = "UPDATE OR REPLACE `db`.`snp_locus` SET chr = ?2, pos = ?3 WHERE _ROWID_ = ?1" + cursor.executemany(sql, newLoci) + numLift += tally["lift"] + numNull += tally["null"] + if trash: + cursor.executemany( + "DELETE FROM `db`.`snp_locus` WHERE _ROWID_ = ?", trash + ) + trash.clear() + minRowID = maxRowID + 1 + maxRowID = minRowID + 2500000 - 1 + # foreach batch + + self.log(" OK: %d loci lifted over, %d dropped\n" % (numLift, numNull)) + + # liftOverSNPLoci() + + def liftOverRegions(self, oldHG, newHG, sourceIDs): + self.log("lifting over regions from hg%d to hg%d ..." % (oldHG, newHG)) + self.prepareTableForUpdate("biopolymer_region") + cursor = self._db.cursor() + numLift = numNull = 0 + tally = dict() + trash = set() + + # identify range of _ROWID_ in biopolymer_region + # (two separate queries is faster because a simple MIN() or MAX() only peeks at the index; + # SQLite isn't clever enough to do that for both at the same time, it does a table scan instead) + firstRowID = min( + row[0] + for row in cursor.execute( + "SELECT MIN(_ROWID_) FROM `db`.`biopolymer_region`" + ) + ) + lastRowID = max( + row[0] + for row in cursor.execute( + "SELECT MAX(_ROWID_) FROM `db`.`biopolymer_region`" + ) + ) + + # define a callback to store regions that can't be lifted over, for later deletion + def errorCallback(region): + trash.add((region[0],)) + + # we can't SELECT and UPDATE the same table at the same time, + # so read in batches of 2.5 million at a time based on _ROWID_ + # (for regions this will probably be all of them in one go, but just in case) + minRowID = firstRowID + maxRowID = minRowID + 2500000 - 1 + while minRowID <= lastRowID: + sql = "SELECT _ROWID_, chr, posMin, posMax, NULL FROM `db`.`biopolymer_region`" + sql += " WHERE (_ROWID_ BETWEEN ? AND ?) 
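# Both liftOver helpers above walk their table in fixed _ROWID_ windows of 2.5
# million rows because SQLite cannot safely SELECT from and UPDATE the same
# table in a single pass.  The windowing itself reduces to this generator
# (names are placeholders, not LOKI APIs):
def rowid_windows(first_rowid, last_rowid, batch=2500000):
    lo = first_rowid
    while lo <= last_rowid:
        yield lo, lo + batch - 1
        lo += batch


# list(rowid_windows(1, 6000000))
# -> [(1, 2500000), (2500001, 5000000), (5000001, 7500000)]
#    (the last window may overshoot; the BETWEEN clause just matches fewer rows)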
AND source_id IN (%s)" % ( + ",".join(str(i) for i in sourceIDs) + ) + oldRegions = list(cursor.execute(sql, (minRowID, maxRowID))) + newRegions = self._loki.generateLiftOverRegions( + oldHG, newHG, oldRegions, tally, errorCallback + ) + sql = "UPDATE OR REPLACE `db`.`biopolymer_region` SET chr = ?2, posMin = ?3, posMax = ?4 WHERE _ROWID_ = ?1 AND (1 OR ?5)" + cursor.executemany(sql, newRegions) + numLift += tally["lift"] + numNull += tally["null"] + if trash: + cursor.executemany( + "DELETE FROM `db`.`biopolymer_region` WHERE _ROWID_ = ?", trash + ) + trash.clear() + minRowID = maxRowID + 1 + maxRowID = minRowID + 2500000 - 1 + # foreach batch + + self.log(" OK: %d regions lifted over, %d dropped\n" % (numLift, numNull)) + + # liftOverRegions() + + def cleanupSNPMerges(self): + self.log("verifying SNP merge records ...") + self.prepareTableForQuery("snp_merge") + dbc = self._db.cursor() + + # for each set of ROWIDs which constitute a duplicated snp merge, cull all but one + cull = set() + sql = "SELECT GROUP_CONCAT(_ROWID_) FROM `db`.`snp_merge` GROUP BY rsMerged HAVING COUNT() > 1" + # for row in dbc.execute("EXPLAIN QUERY PLAN "+sql): #DEBUG + # print row + for row in dbc.execute(sql): + cull.update((int(i),) for i in row[0].split(",")[1:]) + # last = None + # for row in dbc.execute("SELECT _ROWID_, rsMerged FROM `db`.`snp_merge` ORDER BY rsMerged"): + # if last == row[1]: + # cull.add(row[0:1]) + # last = row[1] + if cull: + self.flagTableUpdate("snp_merge") + dbc.executemany("DELETE FROM `db`.`snp_merge` WHERE _ROWID_ = ?", cull) + self.log(" OK: %d duplicate merges\n" % (len(cull),)) + + # cleanupSNPMerges() + + def updateMergedSNPLoci(self): + self.log("checking for merged SNP loci ...") + self.prepareTableForQuery("snp_locus") + self.prepareTableForQuery("snp_merge") + dbc = self._db.cursor() + sql = """ INSERT INTO `db`.`snp_locus` (rs, chr, pos, validated, source_id) SELECT sm.rsCurrent, sl.chr, sl.pos, sl.validated, sl.source_id FROM `db`.`snp_locus` AS sl JOIN `db`.`snp_merge` AS sm ON sm.rsMerged = sl.rs """ - #for row in dbc.execute("EXPLAIN QUERY PLAN "+sql): #DEBUG - # print row - dbc.execute(sql) - numCopied = self._db.changes() - if numCopied: - self.flagTableUpdate('snp_locus') - self.log(" OK: %d loci copied\n" % (numCopied,)) - #updateMergedSNPLoci() - - - def cleanupSNPLoci(self): - self.log("verifying SNP loci ...") - self.prepareTableForQuery('snp_locus') - dbc = self._db.cursor() - # for each set of ROWIDs which constitute a duplicated snp-locus, cull all but one - # but, make sure that if any of the originals were validated, the remaining one is also - valid = set() - cull = set() - sql = "SELECT GROUP_CONCAT(_ROWID_), MAX(validated) FROM `db`.`snp_locus` GROUP BY rs, chr, pos HAVING COUNT() > 1" - #for row in dbc.execute("EXPLAIN QUERY PLAN "+sql): #DEBUG - # print row - for row in dbc.execute(sql): - rowids = row[0].split(',') - if row[1]: - valid.add( (int(rowids[0]),) ) - cull.update( (int(i),) for i in rowids[1:] ) - #last = None - #for row in dbc.execute("SELECT _ROWID_, rs||':'||chr||':'||pos, validated FROM `db`.`snp_locus` ORDER BY rs, chr, pos"): - # if last == row[1]: - # cull.add(row[0:1]) - # if row[2]: - # valid.add( last.split(':') ) - # last = row[1] - if valid: - dbc.executemany("UPDATE `db`.`snp_locus` SET validated = 1 WHERE _ROWID_ = ?", valid) - #dbc.executemany("UPDATE `db`.`snp_locus` SET validated = 1 WHERE rs = ? AND chr = ? 
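# The duplicate-culling routines above all use the same trick: GROUP_CONCAT
# the rowids of each duplicate group and delete every rowid after the first,
# leaving exactly one representative row.  A self-contained toy version of
# that trick (stdlib sqlite3 and COUNT(*) here, purely for illustration):
import sqlite3

db = sqlite3.connect(":memory:")
db.execute("CREATE TABLE snp_merge (rsMerged INTEGER, rsCurrent INTEGER)")
db.executemany("INSERT INTO snp_merge VALUES (?,?)", [(1, 10), (1, 10), (2, 20)])

cull = set()
sql = (
    "SELECT GROUP_CONCAT(rowid) FROM snp_merge "
    "GROUP BY rsMerged HAVING COUNT(*) > 1"
)
for (rowids,) in db.execute(sql):
    cull.update((int(i),) for i in rowids.split(",")[1:])  # keep the first, cull the rest
db.executemany("DELETE FROM snp_merge WHERE rowid = ?", cull)
print(db.execute("SELECT COUNT(*) FROM snp_merge").fetchone()[0])  # 2 rows remain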
AND pos = ?", valid) - if cull: - self.flagTableUpdate('snp_locus') - dbc.executemany("DELETE FROM `db`.`snp_locus` WHERE _ROWID_ = ?", cull) - self.log(" OK: %d duplicate loci\n" % (len(cull),)) - #cleanupSNPLoci() - - - def updateMergedSNPEntrezRoles(self): - self.log("checking for merged SNP roles ...") - self.prepareTableForQuery('snp_entrez_role') - self.prepareTableForQuery('snp_merge') - dbc = self._db.cursor() - sql = """ + # for row in dbc.execute("EXPLAIN QUERY PLAN "+sql): #DEBUG + # print row + dbc.execute(sql) + numCopied = self._db.changes() + if numCopied: + self.flagTableUpdate("snp_locus") + self.log(" OK: %d loci copied\n" % (numCopied,)) + + # updateMergedSNPLoci() + + def cleanupSNPLoci(self): + self.log("verifying SNP loci ...") + self.prepareTableForQuery("snp_locus") + dbc = self._db.cursor() + # for each set of ROWIDs which constitute a duplicated snp-locus, cull all but one + # but, make sure that if any of the originals were validated, the remaining one is also + valid = set() + cull = set() + sql = "SELECT GROUP_CONCAT(_ROWID_), MAX(validated) FROM `db`.`snp_locus` GROUP BY rs, chr, pos HAVING COUNT() > 1" + # for row in dbc.execute("EXPLAIN QUERY PLAN "+sql): #DEBUG + # print row + for row in dbc.execute(sql): + rowids = row[0].split(",") + if row[1]: + valid.add((int(rowids[0]),)) + cull.update((int(i),) for i in rowids[1:]) + # last = None + # for row in dbc.execute("SELECT _ROWID_, rs||':'||chr||':'||pos, validated FROM `db`.`snp_locus` ORDER BY rs, chr, pos"): + # if last == row[1]: + # cull.add(row[0:1]) + # if row[2]: + # valid.add( last.split(':') ) + # last = row[1] + if valid: + dbc.executemany( + "UPDATE `db`.`snp_locus` SET validated = 1 WHERE _ROWID_ = ?", valid + ) + # dbc.executemany("UPDATE `db`.`snp_locus` SET validated = 1 WHERE rs = ? AND chr = ? 
AND pos = ?", valid) + if cull: + self.flagTableUpdate("snp_locus") + dbc.executemany("DELETE FROM `db`.`snp_locus` WHERE _ROWID_ = ?", cull) + self.log(" OK: %d duplicate loci\n" % (len(cull),)) + + # cleanupSNPLoci() + + def updateMergedSNPEntrezRoles(self): + self.log("checking for merged SNP roles ...") + self.prepareTableForQuery("snp_entrez_role") + self.prepareTableForQuery("snp_merge") + dbc = self._db.cursor() + sql = """ INSERT OR IGNORE INTO `db`.`snp_entrez_role` (rs, entrez_id, role_id, source_id) SELECT sm.rsCurrent, ser.entrez_id, ser.role_id, ser.source_id FROM `db`.`snp_entrez_role` AS ser JOIN `db`.`snp_merge` AS sm ON sm.rsMerged = ser.rs """ - #for row in dbc.execute("EXPLAIN QUERY PLAN "+sql): #DEBUG - # print row - dbc.execute(sql) - numCopied = self._db.changes() - if numCopied: - self.flagTableUpdate('snp_entrez_role') - self.log(" OK: %d roles copied\n" % (numCopied,)) - #updateMergedSNPEntrezRoles() - - - def cleanupSNPEntrezRoles(self): - self.log("verifying SNP roles ...") - self.prepareTableForQuery('snp_entrez_role') - dbc = self._db.cursor() - cull = set() - sql = "SELECT GROUP_CONCAT(_ROWID_) FROM `db`.`snp_entrez_role` GROUP BY rs, entrez_id, role_id HAVING COUNT() > 1" - #for row in dbc.execute("EXPLAIN QUERY PLAN "+sql): #DEBUG - # print row - for row in dbc.execute(sql): - cull.update( (int(i),) for i in row[0].split(',')[1:] ) - #last = None - #for row in dbc.execute("SELECT _ROWID_, rs||':'||entrez_id||':'||role_id FROM `db`.`snp_entrez_role` ORDER BY rs, entrez_id, role_id"): - # if last == row[1]: - # cull.add(row[0:1]) - # last = row[1] - if cull: - self.flagTableUpdate('snp_entrez_role') - dbc.executemany("DELETE FROM `db`.`snp_entrez_role` WHERE _ROWID_ = ?", cull) - self.log(" OK: %d duplicate roles\n" % (len(cull),)) - #cleanupSNPEntrezRoles() - - - def updateMergedGWASAnnotations(self): - self.log("checking for merged GWAS annotated SNPs ...") - self.prepareTableForQuery('gwas') - self.prepareTableForQuery('snp_merge') - dbc = self._db.cursor() - sql = """ + # for row in dbc.execute("EXPLAIN QUERY PLAN "+sql): #DEBUG + # print row + dbc.execute(sql) + numCopied = self._db.changes() + if numCopied: + self.flagTableUpdate("snp_entrez_role") + self.log(" OK: %d roles copied\n" % (numCopied,)) + + # updateMergedSNPEntrezRoles() + + def cleanupSNPEntrezRoles(self): + self.log("verifying SNP roles ...") + self.prepareTableForQuery("snp_entrez_role") + dbc = self._db.cursor() + cull = set() + sql = "SELECT GROUP_CONCAT(_ROWID_) FROM `db`.`snp_entrez_role` GROUP BY rs, entrez_id, role_id HAVING COUNT() > 1" + # for row in dbc.execute("EXPLAIN QUERY PLAN "+sql): #DEBUG + # print row + for row in dbc.execute(sql): + cull.update((int(i),) for i in row[0].split(",")[1:]) + # last = None + # for row in dbc.execute("SELECT _ROWID_, rs||':'||entrez_id||':'||role_id FROM `db`.`snp_entrez_role` ORDER BY rs, entrez_id, role_id"): + # if last == row[1]: + # cull.add(row[0:1]) + # last = row[1] + if cull: + self.flagTableUpdate("snp_entrez_role") + dbc.executemany( + "DELETE FROM `db`.`snp_entrez_role` WHERE _ROWID_ = ?", cull + ) + self.log(" OK: %d duplicate roles\n" % (len(cull),)) + + # cleanupSNPEntrezRoles() + + def updateMergedGWASAnnotations(self): + self.log("checking for merged GWAS annotated SNPs ...") + self.prepareTableForQuery("gwas") + self.prepareTableForQuery("snp_merge") + dbc = self._db.cursor() + sql = """ INSERT INTO `db`.`gwas` (rs, chr, pos, trait, snps, orbeta, allele95ci, riskAfreq, pubmed_id, source_id) SELECT sm.rsCurrent, w.chr, w.pos, 
w.trait, w.snps, w.orbeta, w.allele95ci, w.riskAfreq, w.pubmed_id, w.source_id FROM `db`.`gwas` AS w JOIN `db`.`snp_merge` AS sm ON sm.rsMerged = w.rs """ - #for row in dbc.execute("EXPLAIN QUERY PLAN "+sql): #DEBUG - # print row - dbc.execute(sql) - numCopied = self._db.changes() - if numCopied: - self.flagTableUpdate('gwas') - self.log(" OK: %d annotations copied\n" % (numCopied,)) - #updateMergedGWASAnnotations() - - - def resolveBiopolymerNames(self): - #TODO: iterative? - self.log("resolving biopolymer names ...") - dbc = self._db.cursor() - - # calculate confidence scores for each possible name match - dbc.execute(""" + # for row in dbc.execute("EXPLAIN QUERY PLAN "+sql): #DEBUG + # print row + dbc.execute(sql) + numCopied = self._db.changes() + if numCopied: + self.flagTableUpdate("gwas") + self.log(" OK: %d annotations copied\n" % (numCopied,)) + + # updateMergedGWASAnnotations() + + def resolveBiopolymerNames(self): + # TODO: iterative? + self.log("resolving biopolymer names ...") + dbc = self._db.cursor() + + # calculate confidence scores for each possible name match + dbc.execute( + """ CREATE TEMP TABLE `temp`.`_biopolymer_name_name_score` ( new_namespace_id INTERGER NOT NULL, new_name VARCHAR(256) NOT NULL, @@ -726,12 +916,14 @@ def resolveBiopolymerNames(self): implication INTEGER NOT NULL, PRIMARY KEY (new_namespace_id, new_name, biopolymer_id) ) -""") - self.prepareTableForQuery('biopolymer_name_name') - self.prepareTableForQuery('biopolymer_name') - self.prepareTableForQuery('biopolymer') - self.prepareTableForQuery('namespace') - dbc.execute(""" +""" + ) + self.prepareTableForQuery("biopolymer_name_name") + self.prepareTableForQuery("biopolymer_name") + self.prepareTableForQuery("biopolymer") + self.prepareTableForQuery("namespace") + dbc.execute( + """ INSERT INTO `temp`.`_biopolymer_name_name_score` (new_namespace_id, new_name, biopolymer_id, polygenic, implication) /* calculate implication score for each possible match for each name */ SELECT @@ -748,12 +940,14 @@ def resolveBiopolymerNames(self): WHERE bnn.namespace_id IN (0, bn.namespace_id) AND bnn.type_id IN (0, b.type_id) GROUP BY bnn.new_namespace_id, bnn.new_name, bn.biopolymer_id -""") - - # extrapolate new biopolymer_name records - self.prepareTableForUpdate('biopolymer_name') - dbc.execute("DELETE FROM `db`.`biopolymer_name` WHERE source_id = 0") - dbc.execute(""" +""" + ) + + # extrapolate new biopolymer_name records + self.prepareTableForUpdate("biopolymer_name") + dbc.execute("DELETE FROM `db`.`biopolymer_name` WHERE source_id = 0") + dbc.execute( + """ INSERT OR IGNORE INTO `db`.`biopolymer_name` (biopolymer_id, namespace_id, name, source_id) /* identify specific match with the best score for each name */ SELECT @@ -783,16 +977,18 @@ def resolveBiopolymerNames(self): ) JOIN `temp`.`_biopolymer_name_name_score` USING (new_namespace_id, new_name) WHERE polygenic > 0 OR implication >= name_implication -""") - - # clean up - dbc.execute("DROP TABLE `temp`.`_biopolymer_name_name_score`") - numTotal = numUnrec = numMatch = 0 - self.prepareTableForQuery('biopolymer_name_name') - self.prepareTableForQuery('biopolymer_name') - self.prepareTableForQuery('biopolymer') - numTotal = numUnrec = numMatch = 0 - for row in dbc.execute(""" +""" + ) + + # clean up + dbc.execute("DROP TABLE `temp`.`_biopolymer_name_name_score`") + numTotal = numUnrec = numMatch = 0 + self.prepareTableForQuery("biopolymer_name_name") + self.prepareTableForQuery("biopolymer_name") + self.prepareTableForQuery("biopolymer") + numTotal = numUnrec 
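# resolveBiopolymerNames() scores every candidate biopolymer for each new name
# and keeps only the candidates that tie for the best implication score; a
# name whose best score is shared by several biopolymers ends up ambiguous,
# and a name with no candidates ends up unrecognized.  A deliberately
# simplified pure-Python distillation of that selection step (the real logic
# also handles polygenic namespaces and runs entirely in SQL):
def pick_matches(candidates):
    """candidates: dict of biopolymer_id -> implication score for one name."""
    if not candidates:
        return set()  # unrecognized name
    best = max(candidates.values())
    return {bp_id for bp_id, score in candidates.items() if score == best}


# pick_matches({10: 3, 11: 1}) -> {10}      (resolved to one biopolymer)
# pick_matches({10: 2, 11: 2}) -> {10, 11}  (ambiguous)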
= numMatch = 0 + for row in dbc.execute( + """ SELECT COUNT(), SUM(CASE WHEN matches < 1 THEN 1 ELSE 0 END) FROM ( SELECT COUNT(DISTINCT b.biopolymer_id) AS matches @@ -805,10 +1001,12 @@ def resolveBiopolymerNames(self): AND bnn.type_id IN (0, b.type_id) GROUP BY bnn.new_namespace_id, bnn.new_name ) -"""): - numTotal = row[0] or 0 - numUnrec = row[1] or 0 - for row in dbc.execute(""" +""" + ): + numTotal = row[0] or 0 + numUnrec = row[1] or 0 + for row in dbc.execute( + """ SELECT COUNT() FROM ( SELECT 1 @@ -816,28 +1014,33 @@ def resolveBiopolymerNames(self): WHERE source_id = 0 GROUP BY namespace_id, name ) -"""): - numMatch = row[0] or 0 - numAmbig = numTotal - numUnrec - numMatch - self.log("resolving biopolymer names completed: %d identifiers (%d ambiguous, %d unrecognized)\n" % (numMatch,numAmbig,numUnrec)) - #resolveBiopolymerNames() - - - def resolveSNPBiopolymerRoles(self): - self.log("resolving SNP roles ...\n") - dbc = self._db.cursor() - - typeID = self._loki.getTypeID('gene') - namespaceID = self._loki.getNamespaceID('entrez_gid') - numUnrec = 0 - if typeID and namespaceID: - self.prepareTableForUpdate('snp_biopolymer_role') - self.prepareTableForQuery('snp_entrez_role') - self.prepareTableForQuery('biopolymer_name') - dbc.execute("DELETE FROM `db`.`snp_biopolymer_role`") - # we have to convert entrez_id to a string because the optimizer - # won't use the index on biopolymer_name.name if the types don't match - dbc.execute(""" +""" + ): + numMatch = row[0] or 0 + numAmbig = numTotal - numUnrec - numMatch + self.log( + "resolving biopolymer names completed: %d identifiers (%d ambiguous, %d unrecognized)\n" + % (numMatch, numAmbig, numUnrec) + ) + + # resolveBiopolymerNames() + + def resolveSNPBiopolymerRoles(self): + self.log("resolving SNP roles ...\n") + dbc = self._db.cursor() + + typeID = self._loki.getTypeID("gene") + namespaceID = self._loki.getNamespaceID("entrez_gid") + numUnrec = 0 + if typeID and namespaceID: + self.prepareTableForUpdate("snp_biopolymer_role") + self.prepareTableForQuery("snp_entrez_role") + self.prepareTableForQuery("biopolymer_name") + dbc.execute("DELETE FROM `db`.`snp_biopolymer_role`") + # we have to convert entrez_id to a string because the optimizer + # won't use the index on biopolymer_name.name if the types don't match + dbc.execute( + """ INSERT INTO `db`.`snp_biopolymer_role` (rs, biopolymer_id, role_id, source_id) SELECT ser.rs, bn.biopolymer_id, ser.role_id, ser.source_id FROM `db`.`snp_entrez_role` AS ser @@ -845,8 +1048,13 @@ def resolveSNPBiopolymerRoles(self): ON bn.namespace_id = ? AND bn.name = ''||ser.entrez_id JOIN `db`.`biopolymer` AS b ON b.biopolymer_id = bn.biopolymer_id AND b.type_id = ? 
-""", (namespaceID,typeID)) - numUnrec = sum(row[0] for row in dbc.execute(""" +""", + (namespaceID, typeID), + ) + numUnrec = sum( + row[0] + for row in dbc.execute( + """ SELECT COUNT() FROM ( SELECT 1 FROM `db`.`snp_entrez_role` AS ser @@ -857,40 +1065,51 @@ def resolveSNPBiopolymerRoles(self): GROUP BY ser._ROWID_ HAVING MAX(b.biopolymer_id) IS NULL ) -""", (namespaceID,typeID))) - #if type[gene] and namespace[entrez_gid] - - self.prepareTableForQuery('snp_biopolymer_role') - cull = set() - sql = "SELECT GROUP_CONCAT(_ROWID_) FROM `db`.`snp_biopolymer_role` GROUP BY rs, biopolymer_id, role_id HAVING COUNT() > 1" - #for row in dbc.execute("EXPLAIN QUERY PLAN "+sql): #DEBUG - # print row - for row in dbc.execute(sql): - cull.update( (int(i),) for i in row[0].split(',')[1:] ) - #last = None - #for row in dbc.execute("SELECT _ROWID_, rs||':'||biopolymer_id||':'||role_id FROM `db`.`snp_biopolymer_role` ORDER BY rs, biopolymer_id, role_id"): - # if last == row[1]: - # cull.add(row[0:1]) - # last = row[1] - if cull: - self.flagTableUpdate('snp_biopolymer_role') - dbc.executemany("DELETE FROM `db`.`snp_biopolymer_role` WHERE _ROWID_ = ?", cull) - - numTotal = numSNPs = numGenes = 0 - for row in dbc.execute("SELECT COUNT(), COUNT(DISTINCT rs), COUNT(DISTINCT biopolymer_id) FROM `db`.`snp_biopolymer_role`"): - numTotal = row[0] - numSNPs = row[1] - numGenes = row[2] - self.log("resolving SNP roles completed: %d roles (%d SNPs, %d genes; %d unrecognized)\n" % (numTotal,numSNPs,numGenes,numUnrec)) - #resolveSNPBiopolymerRoles() - - - def resolveGroupMembers(self): - self.log("resolving group members ...\n") - dbc = self._db.cursor() - - # calculate confidence scores for each possible name match - dbc.execute(""" +""", + (namespaceID, typeID), + ) + ) + # if type[gene] and namespace[entrez_gid] + + self.prepareTableForQuery("snp_biopolymer_role") + cull = set() + sql = "SELECT GROUP_CONCAT(_ROWID_) FROM `db`.`snp_biopolymer_role` GROUP BY rs, biopolymer_id, role_id HAVING COUNT() > 1" + # for row in dbc.execute("EXPLAIN QUERY PLAN "+sql): #DEBUG + # print row + for row in dbc.execute(sql): + cull.update((int(i),) for i in row[0].split(",")[1:]) + # last = None + # for row in dbc.execute("SELECT _ROWID_, rs||':'||biopolymer_id||':'||role_id FROM `db`.`snp_biopolymer_role` ORDER BY rs, biopolymer_id, role_id"): + # if last == row[1]: + # cull.add(row[0:1]) + # last = row[1] + if cull: + self.flagTableUpdate("snp_biopolymer_role") + dbc.executemany( + "DELETE FROM `db`.`snp_biopolymer_role` WHERE _ROWID_ = ?", cull + ) + + numTotal = numSNPs = numGenes = 0 + for row in dbc.execute( + "SELECT COUNT(), COUNT(DISTINCT rs), COUNT(DISTINCT biopolymer_id) FROM `db`.`snp_biopolymer_role`" + ): + numTotal = row[0] + numSNPs = row[1] + numGenes = row[2] + self.log( + "resolving SNP roles completed: %d roles (%d SNPs, %d genes; %d unrecognized)\n" + % (numTotal, numSNPs, numGenes, numUnrec) + ) + + # resolveSNPBiopolymerRoles() + + def resolveGroupMembers(self): + self.log("resolving group members ...\n") + dbc = self._db.cursor() + + # calculate confidence scores for each possible name match + dbc.execute( + """ CREATE TEMP TABLE `temp`.`_group_member_name_score` ( group_id INTERGER NOT NULL, member INTEGER NOT NULL, @@ -899,12 +1118,14 @@ def resolveGroupMembers(self): implication INTEGER NOT NULL, quality INTEGER NOT NULL ) -""") - self.prepareTableForQuery('group_member_name') - self.prepareTableForQuery('biopolymer_name') - self.prepareTableForQuery('biopolymer') - self.prepareTableForQuery('namespace') - 
dbc.execute(""" +""" + ) + self.prepareTableForQuery("group_member_name") + self.prepareTableForQuery("biopolymer_name") + self.prepareTableForQuery("biopolymer") + self.prepareTableForQuery("namespace") + dbc.execute( + """ INSERT INTO `temp`.`_group_member_name_score` (group_id, member, biopolymer_id, polynames, implication, quality) /* calculate implication and quality scores for each possible match for each member */ SELECT @@ -955,13 +1176,17 @@ def resolveGroupMembers(self): WHERE gmn.namespace_id IN (0, bn.namespace_id) AND gmn.type_id IN (0, b.type_id) GROUP BY group_id, member, biopolymer_id -""") - dbc.execute("CREATE INDEX `temp`.`_group_member_name_score__group_member_biopolymer` ON `_group_member_name_score` (group_id, member, biopolymer_id)") - - # generate group_biopolymer assignments with confidence scores - self.prepareTableForUpdate('group_biopolymer') - dbc.execute("DELETE FROM `db`.`group_biopolymer` WHERE source_id = 0") - dbc.execute(""" +""" + ) + dbc.execute( + "CREATE INDEX `temp`.`_group_member_name_score__group_member_biopolymer` ON `_group_member_name_score` (group_id, member, biopolymer_id)" + ) + + # generate group_biopolymer assignments with confidence scores + self.prepareTableForUpdate("group_biopolymer") + dbc.execute("DELETE FROM `db`.`group_biopolymer` WHERE source_id = 0") + dbc.execute( + """ /* group-biopolymer assignments with confidence scores */ INSERT INTO `db`.`group_biopolymer` (group_id, biopolymer_id, specificity, implication, quality, source_id) SELECT @@ -1021,14 +1246,16 @@ def resolveGroupMembers(self): GROUP BY group_id, member, biopolymer_id ) GROUP BY group_id, biopolymer_id -""") - - # generate group_biopolymer placeholders for unrecognized members - self.prepareTableForUpdate('group_biopolymer') - self.prepareTableForQuery('group_member_name') - self.prepareTableForQuery('biopolymer_name') - self.prepareTableForQuery('biopolymer') - dbc.execute(""" +""" + ) + + # generate group_biopolymer placeholders for unrecognized members + self.prepareTableForUpdate("group_biopolymer") + self.prepareTableForQuery("group_member_name") + self.prepareTableForQuery("biopolymer_name") + self.prepareTableForQuery("biopolymer") + dbc.execute( + """ INSERT INTO `db`.`group_biopolymer` (group_id, biopolymer_id, specificity, implication, quality, source_id) SELECT group_id, @@ -1050,13 +1277,15 @@ def resolveGroupMembers(self): HAVING MAX(b.biopolymer_id) IS NULL ) GROUP BY group_id -""") - - # clean up - dbc.execute("DROP TABLE `temp`.`_group_member_name_score`") - numTotal = numSourced = numMatch = numAmbig = numUnrec = 0 - self.prepareTableForQuery('group_biopolymer') - for row in dbc.execute(""" +""" + ) + + # clean up + dbc.execute("DROP TABLE `temp`.`_group_member_name_score`") + numTotal = numSourced = numMatch = numAmbig = numUnrec = 0 + self.prepareTableForQuery("group_biopolymer") + for row in dbc.execute( + """ SELECT COALESCE(SUM(CASE WHEN biopolymer_id > 0 THEN 1 ELSE 0 END),0) AS total, COALESCE(SUM(CASE WHEN biopolymer_id > 0 AND source_id > 0 THEN 1 ELSE 0 END),0) AS sourced, @@ -1064,54 +1293,69 @@ def resolveGroupMembers(self): COALESCE(SUM(CASE WHEN biopolymer_id > 0 AND source_id = 0 AND (specificity < 100 OR implication < 100 OR quality < 100) THEN 1 ELSE 0 END),0) AS conditional, COALESCE(SUM(CASE WHEN biopolymer_id = 0 AND source_id = 0 THEN specificity ELSE 0 END),0) AS unmatched FROM `db`.`group_biopolymer` -"""): - numTotal = row[0] - numSourced = row[1] - numMatch = row[2] - numAmbig = row[3] - numUnrec = row[4] - 
self.log("resolving group members completed: %d associations (%d explicit, %d definite, %d conditional, %d unrecognized)\n" % (numTotal,numSourced,numMatch,numAmbig,numUnrec)) - #resolveGroupMembers() - - - def updateBiopolymerZones(self): - self.log("calculating zone coverage ...") - size = self._loki.getDatabaseSetting('zone_size',int) - if not size: - raise Exception("ERROR: could not determine database setting 'zone_size'") - dbc = self._db.cursor() - - # make sure all regions are correctly oriented - dbc.execute("UPDATE `db`.`biopolymer_region` SET posMin = posMax, posMax = posMin WHERE posMin > posMax") - - # define zone generator - def _zones(size, regions): - # regions=[ (id,chr,posMin,posMax),... ] - # yields:[ (id,chr,zone),... ] - for r in regions: - for z in range(int(r[2]/size),int(r[3]/size)+1): - yield (r[0],r[1],z) - #_zones() - - # feed all regions through the zone generator - self.prepareTableForUpdate('biopolymer_zone') - self.prepareTableForQuery('biopolymer_region') - dbc.execute("DELETE FROM `db`.`biopolymer_zone`") - dbc.executemany( - "INSERT OR IGNORE INTO `db`.`biopolymer_zone` (biopolymer_id,chr,zone) VALUES (?,?,?)", - _zones( - size, - self._db.cursor().execute("SELECT biopolymer_id,chr,MIN(posMin),MAX(posMax) FROM `db`.`biopolymer_region` GROUP BY biopolymer_id, chr") - ) - ) - - # clean up - self.prepareTableForQuery('biopolymer_zone') - for row in dbc.execute("SELECT COUNT(), COUNT(DISTINCT biopolymer_id) FROM `db`.`biopolymer_zone`"): - numTotal = row[0] - numGenes = row[1] - self.log("calculating zone coverage completed: %d records (%d regions)\n" % (numTotal,numGenes)) - #updateBiopolymerZones() - - -#Updater +""" + ): + numTotal = row[0] + numSourced = row[1] + numMatch = row[2] + numAmbig = row[3] + numUnrec = row[4] + self.log( + "resolving group members completed: %d associations (%d explicit, %d definite, %d conditional, %d unrecognized)\n" + % (numTotal, numSourced, numMatch, numAmbig, numUnrec) + ) + + # resolveGroupMembers() + + def updateBiopolymerZones(self): + self.log("calculating zone coverage ...") + size = self._loki.getDatabaseSetting("zone_size", int) + if not size: + raise Exception("ERROR: could not determine database setting 'zone_size'") + dbc = self._db.cursor() + + # make sure all regions are correctly oriented + dbc.execute( + "UPDATE `db`.`biopolymer_region` SET posMin = posMax, posMax = posMin WHERE posMin > posMax" + ) + + # define zone generator + def _zones(size, regions): + # regions=[ (id,chr,posMin,posMax),... ] + # yields:[ (id,chr,zone),... 
] + for r in regions: + for z in range(int(r[2] / size), int(r[3] / size) + 1): + yield (r[0], r[1], z) + + # _zones() + + # feed all regions through the zone generator + self.prepareTableForUpdate("biopolymer_zone") + self.prepareTableForQuery("biopolymer_region") + dbc.execute("DELETE FROM `db`.`biopolymer_zone`") + dbc.executemany( + "INSERT OR IGNORE INTO `db`.`biopolymer_zone` (biopolymer_id,chr,zone) VALUES (?,?,?)", + _zones( + size, + self._db.cursor().execute( + "SELECT biopolymer_id,chr,MIN(posMin),MAX(posMax) FROM `db`.`biopolymer_region` GROUP BY biopolymer_id, chr" + ), + ), + ) + + # clean up + self.prepareTableForQuery("biopolymer_zone") + for row in dbc.execute( + "SELECT COUNT(), COUNT(DISTINCT biopolymer_id) FROM `db`.`biopolymer_zone`" + ): + numTotal = row[0] + numGenes = row[1] + self.log( + "calculating zone coverage completed: %d records (%d regions)\n" + % (numTotal, numGenes) + ) + + # updateBiopolymerZones() + + +# Updater diff --git a/loki/unsupported_loaders/loki_source_disgenet.py b/loki/unsupported_loaders/loki_source_disgenet.py index 54e943b..344be50 100644 --- a/loki/unsupported_loaders/loki_source_disgenet.py +++ b/loki/unsupported_loaders/loki_source_disgenet.py @@ -8,94 +8,123 @@ class Source_disgenet(loki_source.Source): - - - @classmethod - def getVersionString(cls): - return '1.0 (2023-08-08)' - #getVersionString() - - - def download(self, options): - # download the latest source files - self.downloadFilesFromHTTP('disgenet.org', { - 'disgenet_2020.db.gz': '/static/disgenet_ap1/files/sqlite_downloads/current/disgenet_2020.db.gz', - }) - #download() - - - def update(self, options): - # clear out all old data from this source - self.log("deleting old records from the database ...") - self.deleteAll() - self.log(" OK\n") - - # get or create the required metadata records - namespaceID = self.addNamespaces([ - ('disgenet_id', 0), - ('entrez_gid', 0), - ('disease', 0) - ]) - typeID = self.addTypes([ - ('disease',), - ('gene',), - ]) - subtypeID = self.addSubtypes([ - ('-',), - ]) - - # process disgenet sqlite file - self.log("processing diseases ...") - gunzip('disgenet_2020.db.gz') - diseases = {} - diseaseClass = {} - con = apsw.Connection('disgenet_2020.db') - cur = con.cursor() - comm = 'select diseaseClassNID,diseaseClassName from diseaseClass' - cur.execute(comm) - diseaseClass = {diseaseclass[0]:diseaseclass[1].strip() for diseaseclass in cur.fetchall()} - comm = 'SELECT a.diseaseId,a.diseaseName,b.diseaseClassNID FROM diseaseAttributes a LEFT JOIN disease2class b ON a.diseaseNID=b.diseaseNID order by a.diseaseNID' - cur.execute(comm) - diseases = {disease[0]:[disease[1],disease[2]] for disease in cur.fetchall()} - #foreach line in diseaseFile - self.log(" OK: %d disease\n" % (len(diseases),)) - - # store diseases - self.log("writing diseases to the database ...") - listSubtype = self.addSubtypes([(val,)for val in set(diseaseClass.values())]) - listGroup = diseases.keys() - listAID = self.addTypedGroups(typeID['disease'], ((subtypeID['-'] if diseases[diseaseID][1] is None else listSubtype[diseaseClass[diseases[diseaseID][1]]],diseases[diseaseID][0],None) for diseaseID in listGroup)) - groupAID = dict(zip(listGroup,listAID)) - self.log(" OK\n") - - # store diseases names - self.log("writing diseases names to the database ...") - self.addGroupNamespacedNames(namespaceID['disgenet_id'], ((groupAID[diseaseID],diseaseID) for diseaseID in listGroup)) - self.addGroupNamespacedNames(namespaceID['disease'], ((groupAID[diseaseID],diseases[diseaseID][0]) for 
diseaseID in listGroup)) - diseases = None - diseaseClass = None - self.log(" OK\n") - - # process disgenet disease identifiers - self.log("processing diseases identifiers ...") - diseaseGene = set() - comm = 'SELECT b.geneId,c.diseaseId FROM geneDiseaseNetwork a LEFT JOIN geneAttributes b ON a.geneNID=b.geneNID LEFT JOIN diseaseAttributes c ON a.diseaseNID=c.diseaseNID ORDER BY c.diseaseId' - cur.execute(comm) - diseaseGeneResult = cur.fetchall() - con.close() - numAssoc = 0 - for pair in diseaseGeneResult: - if pair[1] in listGroup: - numAssoc += 1 - diseaseGene.add( (groupAID[pair[1]],numAssoc,pair[0]) ) - self.log(" OK: %d diseases and gene pairs\n" % (len(diseaseGene),)) - - # store gaad disease identifiers - self.log("writing diseases and gene pairs to the database ...") - self.addGroupMemberTypedNamespacedNames(typeID['gene'], namespaceID['entrez_gid'], diseaseGene) - diseaseGene = None - self.log(" OK\n") - - #update() - -#Source_go + + @classmethod + def getVersionString(cls): + return "1.0 (2023-08-08)" + + # getVersionString() + + def download(self, options): + # download the latest source files + self.downloadFilesFromHTTP( + "disgenet.org", + { + "disgenet_2020.db.gz": "/static/disgenet_ap1/files/sqlite_downloads/current/disgenet_2020.db.gz", + }, + ) + + # download() + + def update(self, options): + # clear out all old data from this source + self.log("deleting old records from the database ...") + self.deleteAll() + self.log(" OK\n") + + # get or create the required metadata records + namespaceID = self.addNamespaces( + [("disgenet_id", 0), ("entrez_gid", 0), ("disease", 0)] + ) + typeID = self.addTypes( + [ + ("disease",), + ("gene",), + ] + ) + subtypeID = self.addSubtypes( + [ + ("-",), + ] + ) + + # process disgenet sqlite file + self.log("processing diseases ...") + gunzip("disgenet_2020.db.gz") + diseases = {} + diseaseClass = {} + con = apsw.Connection("disgenet_2020.db") + cur = con.cursor() + comm = "select diseaseClassNID,diseaseClassName from diseaseClass" + cur.execute(comm) + diseaseClass = { + diseaseclass[0]: diseaseclass[1].strip() for diseaseclass in cur.fetchall() + } + comm = "SELECT a.diseaseId,a.diseaseName,b.diseaseClassNID FROM diseaseAttributes a LEFT JOIN disease2class b ON a.diseaseNID=b.diseaseNID order by a.diseaseNID" + cur.execute(comm) + diseases = {disease[0]: [disease[1], disease[2]] for disease in cur.fetchall()} + # foreach line in diseaseFile + self.log(" OK: %d disease\n" % (len(diseases),)) + + # store diseases + self.log("writing diseases to the database ...") + listSubtype = self.addSubtypes([(val,) for val in set(diseaseClass.values())]) + listGroup = diseases.keys() + listAID = self.addTypedGroups( + typeID["disease"], + ( + ( + ( + subtypeID["-"] + if diseases[diseaseID][1] is None + else listSubtype[diseaseClass[diseases[diseaseID][1]]] + ), + diseases[diseaseID][0], + None, + ) + for diseaseID in listGroup + ), + ) + groupAID = dict(zip(listGroup, listAID)) + self.log(" OK\n") + + # store diseases names + self.log("writing diseases names to the database ...") + self.addGroupNamespacedNames( + namespaceID["disgenet_id"], + ((groupAID[diseaseID], diseaseID) for diseaseID in listGroup), + ) + self.addGroupNamespacedNames( + namespaceID["disease"], + ((groupAID[diseaseID], diseases[diseaseID][0]) for diseaseID in listGroup), + ) + diseases = None + diseaseClass = None + self.log(" OK\n") + + # process disgenet disease identifiers + self.log("processing diseases identifiers ...") + diseaseGene = set() + comm = "SELECT 
b.geneId,c.diseaseId FROM geneDiseaseNetwork a LEFT JOIN geneAttributes b ON a.geneNID=b.geneNID LEFT JOIN diseaseAttributes c ON a.diseaseNID=c.diseaseNID ORDER BY c.diseaseId" + cur.execute(comm) + diseaseGeneResult = cur.fetchall() + con.close() + numAssoc = 0 + for pair in diseaseGeneResult: + if pair[1] in listGroup: + numAssoc += 1 + diseaseGene.add((groupAID[pair[1]], numAssoc, pair[0])) + self.log(" OK: %d diseases and gene pairs\n" % (len(diseaseGene),)) + + # store gaad disease identifiers + self.log("writing diseases and gene pairs to the database ...") + self.addGroupMemberTypedNamespacedNames( + typeID["gene"], namespaceID["entrez_gid"], diseaseGene + ) + diseaseGene = None + self.log(" OK\n") + + # update() + + +# Source_go diff --git a/loki/unsupported_loaders/loki_source_gaad.py b/loki/unsupported_loaders/loki_source_gaad.py index 0ad068a..2b4d406 100644 --- a/loki/unsupported_loaders/loki_source_gaad.py +++ b/loki/unsupported_loaders/loki_source_gaad.py @@ -6,143 +6,166 @@ class Source_gaad(loki_source.Source): - - - @classmethod - def getVersionString(cls): - return '1.0 (2023-06-08)' - #getVersionString() - - - def download(self, options): - # download the latest source files - self.downloadFilesFromHTTPS('gaad.medgenius.info', { - 'diseases2.txt.gz': '/Downloads/diseases2.txt.gz', # disease name by AID - 'disease_relationships.txt.gz': '/Downloads/disease_relationships.txt.gz', - 'disease_association_database_annotations_uniprot_ncbiGene.txt.gz': '/Downloads/disease_association_database_annotations_uniprot_ncbiGene.txt.gz', - 'disease_association_genecards.txt.gz': '/Downloads/disease_association_genecards.txt.gz', - 'disease_gene_association_pubmed_textmining_zhao.txt.gz': '/Downloads/disease_gene_association_pubmed_textmining_zhao.txt.gz', - }) - #download() - - - def update(self, options): - # clear out all old data from this source - self.log("deleting old records from the database ...") - self.deleteAll() - self.log(" OK\n") - - # get or create the required metadata records - namespaceID = self.addNamespaces([ - ('gaad_id', 0), - ('entrez_gid', 0), - ('disease', 0) - ]) - relationshipID = self.addRelationships([ - ('disease_co-occurring',), - ]) - typeID = self.addTypes([ - ('disease',), - ('gene',), - ]) - subtypeID = self.addSubtypes([ - ('-',), - ]) - - # process gaad disease - self.log("processing diseases ...") - diseaseFile = self.zfile('diseases2.txt.gz') - diseases = {} - for line in diseaseFile: - if not line.startswith("AID"): - continue - words = line.split("\t") - diseaseID = words[0] - name = words[1].rstrip() - # store disease name of each disease ID (AID) - diseases[diseaseID] = name - #foreach line in diseaseFile - self.log(" OK: %d disease\n" % (len(diseases),)) - - # store diseases - self.log("writing diseases to the database ...") - listGroup = diseases.keys() - listAID = self.addTypedGroups(typeID['disease'], ((subtypeID['-'],group,diseases[group]) for group in listGroup)) - groupAID = dict(zip(listGroup,listAID)) - self.log(" OK\n") - - # store diseases names - self.log("writing diseases names to the database ...") - self.addGroupNamespacedNames(namespaceID['gaad_id'], ((groupAID[group],group) for group in listGroup)) - self.addGroupNamespacedNames(namespaceID['disease'], ((groupAID[group],diseases[group]) for group in listGroup)) - diseases = None - self.log(" OK\n") - - # process gaad disease relationships - self.log("processing diseases relationships ...") - relationshipFile = self.zfile('disease_relationships.txt.gz') - relationships = [] - 
num = 0 - for line in relationshipFile: - if line.startswith("disease_uid1"): - continue - words = line.split("\t") - diseaseID = words[0] - diseaseID2 = words[1] - # store disease pairs that shares genes - relationships.append( (diseaseID,diseaseID2,relationshipID['disease_co-occurring'],None) ) - num+=1 - #foreach line in diseaseFile - self.log(" OK: %d disease relationships\n" % (num,)) - - # store gaad disease relationships - self.log("writing diseases relationships to the database ...") - self.addGroupRelationships(relationships) - relationships = None - self.log(" OK\n") - - # process gaad disease identifiers - self.log("processing diseases identifiers ...") - ncbiFile = self.zfile('disease_association_database_annotations_uniprot_ncbiGene.txt.gz') - genecardsFile = self.zfile('disease_association_genecards.txt.gz') - pubmedFile = self.zfile('disease_gene_association_pubmed_textmining_zhao.txt.gz') - diseaseGene = [] - num = 0 - for line in ncbiFile: - if line.startswith("disease_"): - continue - words = line.split("\t") - diseaseID = words[0].strip() - entrezID = words[1].strip() - num+=1 - diseaseGene.append((groupAID[diseaseID], num, entrezID)) - #foreach line in ncbiFile: - for line in genecardsFile: - if line.startswith("disease_"): - continue - words = line.split("\t") - diseaseID = words[0].strip() - entrezID = words[1].strip() - num+=1 - diseaseGene.append((groupAID[diseaseID], num, entrezID)) - #foreach line in genecardsFile: - for line in pubmedFile: - if line.startswith("disease_"): - continue - words = line.split("\t") - diseaseID = words[2].strip() - entrezID = words[1].strip() - num+=1 - diseaseGene.append((groupAID[diseaseID], num, entrezID)) - #foreach line in pubmedFile: - self.log(" OK: %d diseases and gene pairs\n" % (len(diseaseGene),)) - - # store gaad disease identifiers - self.log("writing diseases and gene pairs to the database ...") - self.addGroupMemberTypedNamespacedNames(typeID['gene'], namespaceID['entrez_gid'], diseaseGene) - diseaseGene = None - self.log(" OK\n") - - #update() - -#Source_go + + @classmethod + def getVersionString(cls): + return "1.0 (2023-06-08)" + + # getVersionString() + + def download(self, options): + # download the latest source files + self.downloadFilesFromHTTPS( + "gaad.medgenius.info", + { + "diseases2.txt.gz": "/Downloads/diseases2.txt.gz", # disease name by AID + "disease_relationships.txt.gz": "/Downloads/disease_relationships.txt.gz", + "disease_association_database_annotations_uniprot_ncbiGene.txt.gz": "/Downloads/disease_association_database_annotations_uniprot_ncbiGene.txt.gz", + "disease_association_genecards.txt.gz": "/Downloads/disease_association_genecards.txt.gz", + "disease_gene_association_pubmed_textmining_zhao.txt.gz": "/Downloads/disease_gene_association_pubmed_textmining_zhao.txt.gz", + }, + ) + + # download() + + def update(self, options): + # clear out all old data from this source + self.log("deleting old records from the database ...") + self.deleteAll() + self.log(" OK\n") + + # get or create the required metadata records + namespaceID = self.addNamespaces( + [("gaad_id", 0), ("entrez_gid", 0), ("disease", 0)] + ) + relationshipID = self.addRelationships( + [ + ("disease_co-occurring",), + ] + ) + typeID = self.addTypes( + [ + ("disease",), + ("gene",), + ] + ) + subtypeID = self.addSubtypes( + [ + ("-",), + ] + ) + + # process gaad disease + self.log("processing diseases ...") + diseaseFile = self.zfile("diseases2.txt.gz") + diseases = {} + for line in diseaseFile: + if not line.startswith("AID"): + 
continue + words = line.split("\t") + diseaseID = words[0] + name = words[1].rstrip() + # store disease name of each disease ID (AID) + diseases[diseaseID] = name + # foreach line in diseaseFile + self.log(" OK: %d disease\n" % (len(diseases),)) + + # store diseases + self.log("writing diseases to the database ...") + listGroup = diseases.keys() + listAID = self.addTypedGroups( + typeID["disease"], + ((subtypeID["-"], group, diseases[group]) for group in listGroup), + ) + groupAID = dict(zip(listGroup, listAID)) + self.log(" OK\n") + + # store diseases names + self.log("writing diseases names to the database ...") + self.addGroupNamespacedNames( + namespaceID["gaad_id"], ((groupAID[group], group) for group in listGroup) + ) + self.addGroupNamespacedNames( + namespaceID["disease"], + ((groupAID[group], diseases[group]) for group in listGroup), + ) + diseases = None + self.log(" OK\n") + + # process gaad disease relationships + self.log("processing diseases relationships ...") + relationshipFile = self.zfile("disease_relationships.txt.gz") + relationships = [] + num = 0 + for line in relationshipFile: + if line.startswith("disease_uid1"): + continue + words = line.split("\t") + diseaseID = words[0] + diseaseID2 = words[1] + # store disease pairs that shares genes + relationships.append( + (diseaseID, diseaseID2, relationshipID["disease_co-occurring"], None) + ) + num += 1 + # foreach line in diseaseFile + self.log(" OK: %d disease relationships\n" % (num,)) + + # store gaad disease relationships + self.log("writing diseases relationships to the database ...") + self.addGroupRelationships(relationships) + relationships = None + self.log(" OK\n") + + # process gaad disease identifiers + self.log("processing diseases identifiers ...") + ncbiFile = self.zfile( + "disease_association_database_annotations_uniprot_ncbiGene.txt.gz" + ) + genecardsFile = self.zfile("disease_association_genecards.txt.gz") + pubmedFile = self.zfile( + "disease_gene_association_pubmed_textmining_zhao.txt.gz" + ) + diseaseGene = [] + num = 0 + for line in ncbiFile: + if line.startswith("disease_"): + continue + words = line.split("\t") + diseaseID = words[0].strip() + entrezID = words[1].strip() + num += 1 + diseaseGene.append((groupAID[diseaseID], num, entrezID)) + # foreach line in ncbiFile: + for line in genecardsFile: + if line.startswith("disease_"): + continue + words = line.split("\t") + diseaseID = words[0].strip() + entrezID = words[1].strip() + num += 1 + diseaseGene.append((groupAID[diseaseID], num, entrezID)) + # foreach line in genecardsFile: + for line in pubmedFile: + if line.startswith("disease_"): + continue + words = line.split("\t") + diseaseID = words[2].strip() + entrezID = words[1].strip() + num += 1 + diseaseGene.append((groupAID[diseaseID], num, entrezID)) + # foreach line in pubmedFile: + self.log(" OK: %d diseases and gene pairs\n" % (len(diseaseGene),)) + + # store gaad disease identifiers + self.log("writing diseases and gene pairs to the database ...") + self.addGroupMemberTypedNamespacedNames( + typeID["gene"], namespaceID["entrez_gid"], diseaseGene + ) + diseaseGene = None + self.log(" OK\n") + + # update() + + +# Source_go diff --git a/loki/unsupported_loaders/loki_source_kegg.py b/loki/unsupported_loaders/loki_source_kegg.py index 5aa2e66..2464d0b 100644 --- a/loki/unsupported_loaders/loki_source_kegg.py +++ b/loki/unsupported_loaders/loki_source_kegg.py @@ -6,220 +6,259 @@ class Source_kegg(loki_source.Source): - - - @classmethod - def getVersionString(cls): - return '2.0 (2013-02-14)' - 
#getVersionString() - - - @classmethod - def getOptions(cls): - return { - 'api': '[rest|cache] -- use the new REST API, or a local file cache (default: rest)' - } - #getOptions() - - - def validateOptions(self, options): - for o,v in options.items(): - if o == 'api': - v = v.strip().lower() - if 'rest'.startswith(v): - v = 'rest' - elif 'cache'.startswith(v): - v = 'cache' - else: - return "api must be 'rest', or 'cache'" - options[o] = v - else: - return "unexpected option '%s'" % o - return True - #validateOptions() - - - def download(self, options): - if (options.get('api') == 'cache'): - # do nothing, update() will just expect the files to already be there - pass - else: # api==rest - self.downloadFilesFromHTTP('rest.kegg.jp', { - 'list-pathway-hsa': '/list/pathway/hsa', - 'link-pathway-hsa': '/link/pathway/hsa', - 'list-disease': '/list/disease', - 'link-disease-hsa': '/link/disease/hsa', - 'category-pathway': '/get/br:br08901/json', - 'category-disease': '/get/br:br08403/json', - }) - #if api==rest/cache - #download() - - - def update(self, options): - # clear out all old data from this source - self.log("deleting old records from the database ...") - self.deleteAll() - self.log(" OK\n") - - # get or create the required metadata records - namespaceID = self.addNamespaces([ - ('kegg_id', 0), - ('pathway', 0), - ('entrez_gid', 0), - ('disease', 0) - ]) - typeID = self.addTypes([ - ('pathway',), - ('gene',), - ('disease',), - ]) - - # process pathways - self.log("processing pathways ...") - #read pathway categories json file into pathCategory - pathCategory = [] - with open(r'category-pathway') as pathCategoryFile: - pathCategory = json.load(pathCategoryFile) - #store subtypes into pathSubtype - pathSubtype = {} - for category in pathCategory['children']: - for category2 in category['children']: - if category2['name']=='Global and overview maps' or category2['name']=='Carbohydrate metabolism' or category2['name']=='Energy metabolism' or category2['name']=='Immune system' or category2['name']=='Endocrine system': - continue - for category3 in category2['children']: - line = category3['name'].split(" ") - pathID = "hsa"+line[0] - pathSubtype[pathID] = category2['name'] - pathCategory = None - #with pathCategory - pathName = {} - with open('list-pathway-hsa','rU') as pathFile: - for line in pathFile: - words = line.split("\t") - pathID = words[0] - if pathID not in pathSubtype: - pathSubtype[pathID] = "-" - name = words[1].rstrip() - if name.endswith(" - Homo sapiens (human)"): - name = name[:-23] - pathName[pathID] = name - #foreach line in pathFile - #with pathFile - self.log(" OK: %d pathways\n" % (len(pathName),)) - - # store pathways - self.log("writing pathways to the database ...") - listPath = pathName.keys() - listSubtype = self.addSubtypes([(val,)for val in set(pathSubtype.values())]) - listGID = self.addTypedGroups(typeID['pathway'], ((listSubtype[pathSubtype[pathID]],pathName[pathID],None) for pathID in listPath)) - pathGID = dict(zip(listPath,listGID)) - self.log(" OK\n") - - # store pathway names - self.log("writing pathway names to the database ...") - self.addGroupNamespacedNames(namespaceID['kegg_id'], ((pathGID[pathID],pathID) for pathID in listPath)) - self.addGroupNamespacedNames(namespaceID['pathway'], ((pathGID[pathID],pathName[pathID]) for pathID in listPath)) - self.log(" OK\n") - pathName = None - listPath = None - - # process associations - self.log("processing pathway gene associations ...") - entrezAssoc = set() - numAssoc = 0 - with 
open('link-pathway-hsa','rU') as assocFile: - for line in assocFile: - words = line.split("\t") - hsaGene = words[0] - pathID = words[1].strip().replace("path:hsa","hsa") - if pathID in pathGID: - numAssoc += 1 - entrezAssoc.add( (pathGID[pathID],numAssoc,hsaGene[4:]) ) - #if pathway and gene are ok - #foreach line in assocFile - #with assocFile - self.log(" OK: %d associations\n" % (numAssoc,)) - listSubtype = None - pathGID = None - - # store gene associations - self.log("writing gene associations to the database ...") - self.addGroupMemberTypedNamespacedNames(typeID['gene'], namespaceID['entrez_gid'], entrezAssoc) - self.log(" OK\n") - entrezAssoc = None - - # process diseases - self.log("processing diseases ...") - #read disease categories json file into diseaseCategory - diseaseCategory = [] - with open(r'category-disease') as diseaseCategoryFile: - diseaseCategory = json.load(diseaseCategoryFile) - #store subtypes into diseaseSubtype - diseaseSubtype = {} - for category in diseaseCategory['children']: - for category2 in category['children']: - if 'children' not in category2: - continue - for category3 in category2['children']: - if 'children' not in category3: - continue - for category4 in category3['children']: - line = category4['name'] - if not line.startswith("H"): - continue; - diseaseID = line.split(" ")[0] - diseaseSubtype[diseaseID] = category2['name'] - diseaseCategory = None - #with diseaseCategory - diseaseName = {} - with open('list-disease','rU') as pathFile: - for line in pathFile: - words = line.split("\t") - pathID = words[0] - if pathID not in diseaseSubtype: - diseaseSubtype[pathID] = "-" - name = words[1].rstrip() - diseaseName[pathID] = name - #foreach line in diseaseFile - #with diseaseFile - self.log(" OK: %d diseases\n" % (len(diseaseName),)) - - # store diseases - self.log("writing diseases to the database ...") - listDisease = diseaseName.keys() - listSubtype = self.addSubtypes([(val,)for val in set(diseaseSubtype.values())]) - listGID = self.addTypedGroups(typeID['disease'], ((listSubtype[diseaseSubtype[diseaseID]],diseaseName[diseaseID],None) for diseaseID in listDisease)) - diseaseGID = dict(zip(listDisease,listGID)) - self.log(" OK\n") - - # store disease names - self.log("writing disease names to the database ...") - self.addGroupNamespacedNames(namespaceID['kegg_id'], ((diseaseGID[diseaseID],diseaseID) for diseaseID in listDisease)) - self.addGroupNamespacedNames(namespaceID['disease'], ((diseaseGID[diseaseID],diseaseName[diseaseID]) for diseaseID in listDisease)) - self.log(" OK\n") - - # process disease & gene associations - self.log("processing disease gene associations ...") - entrezAssoc = set() - numAssoc = 0 - with open('link-disease-hsa','rU') as assocFile: - for line in assocFile: - words = line.split("\t") - hsaGene = words[0] - diseaseID = words[1].strip()[3:] - if diseaseID in diseaseGID: - numAssoc += 1 - entrezAssoc.add( (diseaseGID[diseaseID],numAssoc,hsaGene[4:]) ) - #foreach line in assocFile - #with assocFile - self.log(" OK: %d associations\n" % (numAssoc,)) - - # store gene associations - self.log("writing gene associations to the database ...") - self.addGroupMemberTypedNamespacedNames(typeID['gene'], namespaceID['entrez_gid'], entrezAssoc) - self.log(" OK\n") - entrezAssoc = None - #update() - -#Source_kegg + + @classmethod + def getVersionString(cls): + return "2.0 (2013-02-14)" + + # getVersionString() + + @classmethod + def getOptions(cls): + return { + "api": "[rest|cache] -- use the new REST API, or a local file cache 
(default: rest)" + } + + # getOptions() + + def validateOptions(self, options): + for o, v in options.items(): + if o == "api": + v = v.strip().lower() + if "rest".startswith(v): + v = "rest" + elif "cache".startswith(v): + v = "cache" + else: + return "api must be 'rest', or 'cache'" + options[o] = v + else: + return "unexpected option '%s'" % o + return True + + # validateOptions() + + def download(self, options): + if options.get("api") == "cache": + # do nothing, update() will just expect the files to already be there + pass + else: # api==rest + self.downloadFilesFromHTTP( + "rest.kegg.jp", + { + "list-pathway-hsa": "/list/pathway/hsa", + "link-pathway-hsa": "/link/pathway/hsa", + "list-disease": "/list/disease", + "link-disease-hsa": "/link/disease/hsa", + "category-pathway": "/get/br:br08901/json", + "category-disease": "/get/br:br08403/json", + }, + ) + # if api==rest/cache + + # download() + + def update(self, options): + # clear out all old data from this source + self.log("deleting old records from the database ...") + self.deleteAll() + self.log(" OK\n") + + # get or create the required metadata records + namespaceID = self.addNamespaces( + [("kegg_id", 0), ("pathway", 0), ("entrez_gid", 0), ("disease", 0)] + ) + typeID = self.addTypes( + [ + ("pathway",), + ("gene",), + ("disease",), + ] + ) + + # process pathways + self.log("processing pathways ...") + # read pathway categories json file into pathCategory + pathCategory = [] + with open(r"category-pathway") as pathCategoryFile: + pathCategory = json.load(pathCategoryFile) + # store subtypes into pathSubtype + pathSubtype = {} + for category in pathCategory["children"]: + for category2 in category["children"]: + if ( + category2["name"] == "Global and overview maps" + or category2["name"] == "Carbohydrate metabolism" + or category2["name"] == "Energy metabolism" + or category2["name"] == "Immune system" + or category2["name"] == "Endocrine system" + ): + continue + for category3 in category2["children"]: + line = category3["name"].split(" ") + pathID = "hsa" + line[0] + pathSubtype[pathID] = category2["name"] + pathCategory = None + # with pathCategory + pathName = {} + with open("list-pathway-hsa", "rU") as pathFile: + for line in pathFile: + words = line.split("\t") + pathID = words[0] + if pathID not in pathSubtype: + pathSubtype[pathID] = "-" + name = words[1].rstrip() + if name.endswith(" - Homo sapiens (human)"): + name = name[:-23] + pathName[pathID] = name + # foreach line in pathFile + # with pathFile + self.log(" OK: %d pathways\n" % (len(pathName),)) + + # store pathways + self.log("writing pathways to the database ...") + listPath = pathName.keys() + listSubtype = self.addSubtypes([(val,) for val in set(pathSubtype.values())]) + listGID = self.addTypedGroups( + typeID["pathway"], + ( + (listSubtype[pathSubtype[pathID]], pathName[pathID], None) + for pathID in listPath + ), + ) + pathGID = dict(zip(listPath, listGID)) + self.log(" OK\n") + + # store pathway names + self.log("writing pathway names to the database ...") + self.addGroupNamespacedNames( + namespaceID["kegg_id"], ((pathGID[pathID], pathID) for pathID in listPath) + ) + self.addGroupNamespacedNames( + namespaceID["pathway"], + ((pathGID[pathID], pathName[pathID]) for pathID in listPath), + ) + self.log(" OK\n") + pathName = None + listPath = None + + # process associations + self.log("processing pathway gene associations ...") + entrezAssoc = set() + numAssoc = 0 + with open("link-pathway-hsa", "rU") as assocFile: + for line in assocFile: + words = 
line.split("\t") + hsaGene = words[0] + pathID = words[1].strip().replace("path:hsa", "hsa") + if pathID in pathGID: + numAssoc += 1 + entrezAssoc.add((pathGID[pathID], numAssoc, hsaGene[4:])) + # if pathway and gene are ok + # foreach line in assocFile + # with assocFile + self.log(" OK: %d associations\n" % (numAssoc,)) + listSubtype = None + pathGID = None + + # store gene associations + self.log("writing gene associations to the database ...") + self.addGroupMemberTypedNamespacedNames( + typeID["gene"], namespaceID["entrez_gid"], entrezAssoc + ) + self.log(" OK\n") + entrezAssoc = None + + # process diseases + self.log("processing diseases ...") + # read disease categories json file into diseaseCategory + diseaseCategory = [] + with open(r"category-disease") as diseaseCategoryFile: + diseaseCategory = json.load(diseaseCategoryFile) + # store subtypes into diseaseSubtype + diseaseSubtype = {} + for category in diseaseCategory["children"]: + for category2 in category["children"]: + if "children" not in category2: + continue + for category3 in category2["children"]: + if "children" not in category3: + continue + for category4 in category3["children"]: + line = category4["name"] + if not line.startswith("H"): + continue + diseaseID = line.split(" ")[0] + diseaseSubtype[diseaseID] = category2["name"] + diseaseCategory = None + # with diseaseCategory + diseaseName = {} + with open("list-disease", "rU") as pathFile: + for line in pathFile: + words = line.split("\t") + pathID = words[0] + if pathID not in diseaseSubtype: + diseaseSubtype[pathID] = "-" + name = words[1].rstrip() + diseaseName[pathID] = name + # foreach line in diseaseFile + # with diseaseFile + self.log(" OK: %d diseases\n" % (len(diseaseName),)) + + # store diseases + self.log("writing diseases to the database ...") + listDisease = diseaseName.keys() + listSubtype = self.addSubtypes([(val,) for val in set(diseaseSubtype.values())]) + listGID = self.addTypedGroups( + typeID["disease"], + ( + (listSubtype[diseaseSubtype[diseaseID]], diseaseName[diseaseID], None) + for diseaseID in listDisease + ), + ) + diseaseGID = dict(zip(listDisease, listGID)) + self.log(" OK\n") + + # store disease names + self.log("writing disease names to the database ...") + self.addGroupNamespacedNames( + namespaceID["kegg_id"], + ((diseaseGID[diseaseID], diseaseID) for diseaseID in listDisease), + ) + self.addGroupNamespacedNames( + namespaceID["disease"], + ( + (diseaseGID[diseaseID], diseaseName[diseaseID]) + for diseaseID in listDisease + ), + ) + self.log(" OK\n") + + # process disease & gene associations + self.log("processing disease gene associations ...") + entrezAssoc = set() + numAssoc = 0 + with open("link-disease-hsa", "rU") as assocFile: + for line in assocFile: + words = line.split("\t") + hsaGene = words[0] + diseaseID = words[1].strip()[3:] + if diseaseID in diseaseGID: + numAssoc += 1 + entrezAssoc.add((diseaseGID[diseaseID], numAssoc, hsaGene[4:])) + # foreach line in assocFile + # with assocFile + self.log(" OK: %d associations\n" % (numAssoc,)) + + # store gene associations + self.log("writing gene associations to the database ...") + self.addGroupMemberTypedNamespacedNames( + typeID["gene"], namespaceID["entrez_gid"], entrezAssoc + ) + self.log(" OK\n") + entrezAssoc = None + + # update() + + +# Source_kegg diff --git a/loki/unsupported_loaders/loki_source_netpath.py b/loki/unsupported_loaders/loki_source_netpath.py index 7f351ec..1e05e4c 100644 --- a/loki/unsupported_loaders/loki_source_netpath.py +++ 
b/loki/unsupported_loaders/loki_source_netpath.py @@ -5,107 +5,138 @@ class Source_netpath(loki_source.Source): - - - @classmethod - def getVersionString(cls): - return '2.0 (2013-02-14)' - #getVersionString() - - - def download(self, options): - # download the latest source files - self.downloadFilesFromHTTP('www.netpath.org', { - # 'NetPath_GeneReg_TSV.zip': '/data/batch/NetPath_GeneReg_TSV.zip', #Last-Modified: Fri, 31 Oct 2008 17:00:16 GMT - 'NetPath_GeneReg_TSV1.zip': '/data/batch/NetPath_GeneReg_TSV1.zip', #Last-Modified: Sat, 03 Sep 2011 10:07:03 GMT - }) - #download() - - - def update(self, options): - # clear out all old data from this source - self.log("deleting old records from the database ...") - self.deleteAll() - self.log(" OK\n") - - # get or create the required metadata records - namespaceID = self.addNamespaces([ - ('netpath_id', 0), - ('pathway', 0), - ('symbol', 0), - ('entrez_gid', 0), - ]) - typeID = self.addTypes([ - ('pathway',), - ('gene',), - ]) - subtypeID = self.addSubtypes([ - ('-',), - ]) - - # process pathways and associations - self.log("verifying archive ...") - pathName = {} - nsAssoc = { - 'symbol' : set(), - 'entrez_gid' : set(), - } - numAssoc = 0 - with zipfile.ZipFile('NetPath_GeneReg_TSV1.zip','r') as pathZip: - err = pathZip.testzip() - if err: - self.log(" ERROR\n") - self.log("CRC failed for %s\n" % err) - return False - self.log(" OK\n") - self.log("processing pathways ...") - for info in pathZip.infolist(): - # there should be only one, but just in case.. - if info.filename == 'NetPath_Gene_regulation_all.txt': - pathFile = pathZip.open(info,'r') - header = pathFile.__next__().rstrip() - if not header.decode().startswith("Gene regulation id Pathway name Pathway ID Gene name Entrez gene ID"): # Regulation Experiment PubMed ID - self.log(" ERROR\n") - self.log("unrecognized file header in '%s': %s\n" % (info.filename,header)) - return False - for line in pathFile: - words = line.decode('latin-1').split("\t") - pathway = words[1] - pathID = words[2] - gene = words[3].strip() - entrezID = words[4] - - pathName[pathID] = pathway - numAssoc += 1 - nsAssoc['entrez_gid'].add( (pathID,numAssoc,entrezID) ) - nsAssoc['symbol'].add( (pathID,numAssoc,gene) ) - #foreach line in pathFile - pathFile.close() - #if file is the one we want - #foreach file in pathZip - #with pathZip - numPathways = len(pathName) - numID = sum(len(nsAssoc[ns]) for ns in nsAssoc) - self.log(" OK: %d pathways, %d associations (%d identifiers)\n" % (numPathways,numAssoc,numID)) - - # store pathways - self.log("writing pathways to the database ...") - listPath = pathName.keys() - listGID = self.addTypedGroups(typeID['pathway'], ((subtypeID['-'], pathName[pathID],None) for pathID in listPath)) - pathGID = dict(zip(listPath,listGID)) - self.log(" OK\n") - - # store pathway names - self.log("writing pathway names to the database ...") - self.addGroupNamespacedNames(namespaceID['netpath_id'], ((pathGID[pathID],pathID) for pathID in listPath)) - self.addGroupNamespacedNames(namespaceID['pathway'], ((pathGID[pathID],pathName[pathID]) for pathID in listPath)) - self.log(" OK\n") - - # store gene associations - self.log("writing gene associations to the database ...") - for ns in nsAssoc: - self.addGroupMemberTypedNamespacedNames(typeID['gene'], namespaceID[ns], ((pathGID[assoc[0]],assoc[1],assoc[2]) for assoc in nsAssoc[ns])) - self.log(" OK\n") - #update() - -#Source_netpath + + @classmethod + def getVersionString(cls): + return "2.0 (2013-02-14)" + + # getVersionString() + + def 
download(self, options): + # download the latest source files + self.downloadFilesFromHTTP( + "www.netpath.org", + { + # 'NetPath_GeneReg_TSV.zip': '/data/batch/NetPath_GeneReg_TSV.zip', #Last-Modified: Fri, 31 Oct 2008 17:00:16 GMT + "NetPath_GeneReg_TSV1.zip": "/data/batch/NetPath_GeneReg_TSV1.zip", # Last-Modified: Sat, 03 Sep 2011 10:07:03 GMT + }, + ) + + # download() + + def update(self, options): + # clear out all old data from this source + self.log("deleting old records from the database ...") + self.deleteAll() + self.log(" OK\n") + + # get or create the required metadata records + namespaceID = self.addNamespaces( + [ + ("netpath_id", 0), + ("pathway", 0), + ("symbol", 0), + ("entrez_gid", 0), + ] + ) + typeID = self.addTypes( + [ + ("pathway",), + ("gene",), + ] + ) + subtypeID = self.addSubtypes( + [ + ("-",), + ] + ) + + # process pathways and associations + self.log("verifying archive ...") + pathName = {} + nsAssoc = { + "symbol": set(), + "entrez_gid": set(), + } + numAssoc = 0 + with zipfile.ZipFile("NetPath_GeneReg_TSV1.zip", "r") as pathZip: + err = pathZip.testzip() + if err: + self.log(" ERROR\n") + self.log("CRC failed for %s\n" % err) + return False + self.log(" OK\n") + self.log("processing pathways ...") + for info in pathZip.infolist(): + # there should be only one, but just in case.. + if info.filename == "NetPath_Gene_regulation_all.txt": + pathFile = pathZip.open(info, "r") + header = pathFile.__next__().rstrip() + if not header.decode().startswith( + "Gene regulation id Pathway name Pathway ID Gene name Entrez gene ID" + ): # Regulation Experiment PubMed ID + self.log(" ERROR\n") + self.log( + "unrecognized file header in '%s': %s\n" + % (info.filename, header) + ) + return False + for line in pathFile: + words = line.decode("latin-1").split("\t") + pathway = words[1] + pathID = words[2] + gene = words[3].strip() + entrezID = words[4] + + pathName[pathID] = pathway + numAssoc += 1 + nsAssoc["entrez_gid"].add((pathID, numAssoc, entrezID)) + nsAssoc["symbol"].add((pathID, numAssoc, gene)) + # foreach line in pathFile + pathFile.close() + # if file is the one we want + # foreach file in pathZip + # with pathZip + numPathways = len(pathName) + numID = sum(len(nsAssoc[ns]) for ns in nsAssoc) + self.log( + " OK: %d pathways, %d associations (%d identifiers)\n" + % (numPathways, numAssoc, numID) + ) + + # store pathways + self.log("writing pathways to the database ...") + listPath = pathName.keys() + listGID = self.addTypedGroups( + typeID["pathway"], + ((subtypeID["-"], pathName[pathID], None) for pathID in listPath), + ) + pathGID = dict(zip(listPath, listGID)) + self.log(" OK\n") + + # store pathway names + self.log("writing pathway names to the database ...") + self.addGroupNamespacedNames( + namespaceID["netpath_id"], + ((pathGID[pathID], pathID) for pathID in listPath), + ) + self.addGroupNamespacedNames( + namespaceID["pathway"], + ((pathGID[pathID], pathName[pathID]) for pathID in listPath), + ) + self.log(" OK\n") + + # store gene associations + self.log("writing gene associations to the database ...") + for ns in nsAssoc: + self.addGroupMemberTypedNamespacedNames( + typeID["gene"], + namespaceID[ns], + ((pathGID[assoc[0]], assoc[1], assoc[2]) for assoc in nsAssoc[ns]), + ) + self.log(" OK\n") + + # update() + + +# Source_netpath diff --git a/loki/util/liftOver.py b/loki/util/liftOver.py index eb47e11..ba75f89 100644 --- a/loki/util/liftOver.py +++ b/loki/util/liftOver.py @@ -10,344 +10,371 @@ # Docstring has not been inspected line by line 
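
The loaders reformatted above (disgenet, gaad, kegg, netpath) all follow the same Source lifecycle: download the raw files, wipe the source's old rows, register namespaces/types/subtypes, create typed groups, attach group names, and finally attach member genes. As an editor's illustrative sketch (not part of the patch), the skeleton below abstracts that shared flow; it assumes the loki package is importable as shown and that the Source helpers behave as they are used in this diff. The source name, host, file name, and gene id are placeholders.

from loki import loki_source  # import path assumed from the package layout


class Source_example(loki_source.Source):
    """Hypothetical loader illustrating the pattern shared by the loaders above."""

    @classmethod
    def getVersionString(cls):
        return "1.0 (example)"

    def download(self, options):
        # fetch the raw file(s) that update() will parse (host/path are placeholders)
        self.downloadFilesFromHTTP("example.org", {"groups.txt.gz": "/groups.txt.gz"})

    def update(self, options):
        # 1. drop anything previously loaded by this source
        self.deleteAll()

        # 2. register the identifier namespaces, record types and subtypes used below
        namespaceID = self.addNamespaces([("example_id", 0), ("entrez_gid", 0)])
        typeID = self.addTypes([("pathway",), ("gene",)])
        subtypeID = self.addSubtypes([("-",)])

        # 3. parse the downloaded file into {group_id: group_name}
        #    (a two-column, tab-delimited layout is assumed here)
        names = {}
        for line in self.zfile("groups.txt.gz"):
            groupID, name = line.rstrip("\n").split("\t")[:2]
            names[groupID] = name

        # 4. create one group per parsed entry, then attach its identifiers/names
        listGroup = list(names.keys())
        listGID = self.addTypedGroups(
            typeID["pathway"],
            ((subtypeID["-"], names[g], None) for g in listGroup),
        )
        groupGID = dict(zip(listGroup, listGID))
        self.addGroupNamespacedNames(
            namespaceID["example_id"], ((groupGID[g], g) for g in listGroup)
        )

        # 5. attach member genes as (group_id, member_index, identifier) tuples;
        #    "1234" stands in for a real Entrez gene id
        members = {(groupGID[g], i + 1, "1234") for i, g in enumerate(listGroup)}
        self.addGroupMemberTypedNamespacedNames(
            typeID["gene"], namespaceID["entrez_gid"], members
        )

The individual loaders differ only in how step 3 parses their particular files and in which namespaces and relationship types they register, which is why their diffs read so similarly.
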
################################################## + class liftOver(object): - """ - A class for lifting over genomic coordinates between assemblies. - - This class provides methods to map genomic regions from one assembly - (old_ucschg) to another (new_ucschg) using chain data stored in a database. - - Attributes: - ----------- - _db : loki_db.Database - Instance of the LOKI database used for storing chain data. - _old_ucschg : int - Version of the old assembly (e.g., 19). - _new_ucschg : int - Version of the new assembly (e.g., 38). - _cached : bool - Flag indicating whether to use cached chain data for optimization. - _minFrac : float - Minimum fraction of the region that must be mapped for successful liftOver. - - Methods: - -------- - __init__(db, old_ucschg, new_ucschg, cached=False): - Initializes a liftOver object with the provided parameters. - - _initChains(): - Initializes the cached chain data from the database. - - _findChains(chrom, start, end): - Finds chain segments that overlap with the given region. - - liftRegion(chrom, start, end): - Lifts a genomic region from old_ucschg to new_ucschg assembly. - - _mapRegion(region, first_seg, end_seg, total_mapped_sz): - Maps a region using chain segment data. - - Notes: - ------ - This class assumes chain data is stored in the LOKI database and uses - this data to perform liftOver operations between assemblies. - """ - - def __init__(self, db, old_ucschg, new_ucschg, cached=False): - """ - Initializes a liftOver object with the provided parameters. - - Parameters: - ----------- - db : loki_db.Database - Instance of the LOKI database containing chain data. - old_ucschg : int - Version of the old assembly (e.g., 19). - new_ucschg : int - Version of the new assembly (e.g., 38). - cached : bool, optional - Flag indicating whether to use cached chain data (default is False). - """ - # db is a loki_db.Database object - self._db = db - self._old_ucschg = old_ucschg - self._new_ucschg = new_ucschg - self._cached = cached - self._minFrac = 0.95 - if self._cached: - self._cached_data = {} - self._cached_keys = {} - self._chainData = self._initChains() - - def _initChains(self): - """ - Initializes the cached chain data from the database. - - This method constructs a cached representation of chain data for - optimized region mapping. - """ - for row in self._db._db.cursor().execute("SELECT chain_id, old_chr, score, chain.old_start, " + - "chain.old_end, chain.new_start, is_fwd, new_chr, " + - "chain_data.old_start, chain_data.old_end, chain_data.new_start " + - "FROM db.chain INNER JOIN db.chain_data USING (chain_id) " + - "WHERE old_ucschg=? AND new_ucschg=?" + - "ORDER BY old_chr, score DESC, chain_data.old_start", - (self._old_ucschg,self._new_ucschg)): - - chain = (row[2], row[3], row[4], row[5], row[6], row[7], row[0]) - chr = row[1] - - if chr not in self._cached_data: - self._cached_data[chr] = {chain: []} - self._cached_keys[chr] = [chain] - elif chain not in self._cached_data[chr]: - self._cached_data[chr][chain] = [] - self._cached_keys[chr].append(chain) - - self._cached_data[chr][chain].append((row[8],row[9],row[10])) - - # Sort the chains by score - for k in self._cached_keys: - self._cached_keys[k].sort(reverse=True) - - - - def _findChains(self, chrom, start, end): - """ - Finds chain segments that overlap with the given region. - - Parameters: - ----------- - chrom : str - Chromosome name or identifier. - start : int - Start position of the region. - end : int - End position of the region. 
- - Yields: - ------ - tuple: - Chain segment details including chain_id, old_start, old_end, - new_start, is_fwd, new_chr. - - Notes: - ------ - This method queries the database or uses cached data to find chain - segments that overlap with the specified region. - """ - if not self._cached: - for row in self._db._db.cursor().execute( - "SELECT chain.chain_id, chain_data.old_start, chain_data.old_end, chain_data.new_start, is_fwd, new_chr " + - "FROM chain INNER JOIN chain_data ON chain.chain_id = chain_data.chain_id " + - "WHERE old_ucschg=? AND new_ucschg=? AND old_chr=? AND chain.old_end>=? AND chain.old_start=? AND chain_data.old_start= c[1]: - data = self._cached_data[chrom][c] - idx = bisect.bisect(data, (start, sys.maxsize, sys.maxsize)) - if idx: - idx = idx-1 - - if idx < len(data) - 1 and start == data[idx + 1]: - idx = idx + 1 - - while idx < len(data) and data[idx][0] < end: - yield (c[-1], data[idx][0], data[idx][1], data[idx][2], c[4], c[5]) - idx = idx + 1 - - - def liftRegion(self, chrom, start, end): - """ - Lifts a genomic region from old_ucschg to new_ucschg assembly. - - Parameters: - ----------- - chrom : str - Chromosome name or identifier. - start : int - Start position of the region. - end : int - End position of the region. - - Returns: - -------- - tuple or None: - Mapped region (new_chr, new_start, new_end) or None if unable to map. - - Notes: - ------ - This method uses chain data to map the specified genomic region from - the old_ucschg assembly to the new_ucschg assembly. - """ - # We need to actually lift regions to detect dropped sections - is_region = True - - # If the start and end are swapped, reverse them, please - if start > end: - (start, end) = (end, start) - elif start == end: - is_region = False - end = start + 1 - - ch_list = self._findChains(chrom, start, end) - - # This will be a tuple of (start, end) of the mapped region - # If the function returns "None", then it was unable to map - # the region into the new assembly - mapped_reg = None - - curr_chain = None - - total_mapped_sz = 0 - first_seg = None - end_seg = None - for seg in ch_list: - if curr_chain is None: - curr_chain = seg[0] - first_seg = seg - end_seg = seg - total_mapped_sz = seg[2] - seg[1] - elif seg[0] != curr_chain: - mapped_reg = self._mapRegion((start, end), first_seg, end_seg, total_mapped_sz) - if not mapped_reg: - first_seg = seg - end_seg = seg - total_mapped_sz = seg[2] - seg[1] - else: - break - else: - end_seg = seg - total_mapped_sz = total_mapped_sz + seg[2] - seg[1] - - if not mapped_reg and first_seg is not None: - mapped_reg = self._mapRegion((start, end), first_seg, end_seg, total_mapped_sz) - - if mapped_reg and not is_region: - mapped_reg = (mapped_reg[0], mapped_reg[1], mapped_reg[1]) #bug? - - return mapped_reg - - - - - def _mapRegion(self, region, first_seg, end_seg, total_mapped_sz): - """ - Maps a region using chain segment data. - - Parameters: - ----------- - region : tuple - Genomic region (start, end) to map. - first_seg : tuple - First segment of the chain (chain_id, old_start, old_end, new_start, is_fwd, new_chr). - end_seg : tuple - Last segment of the chain (chain_id, old_start, old_end, new_start, is_fwd, new_chr). - total_mapped_sz : int - Total size of mapped segments. - - Returns: - -------- - tuple or None: - Mapped region (new_chr, new_start, new_end) or None if unable to map. - - Notes: - ------ - This method calculates the mapped region based on the chain segments - and verifies if the mapped fraction meets the minimum required. 
- """ - mapped_reg = None - - # The front and end differences are the distances from the - # beginning of the segment. - - # The front difference should be >= 0 and <= size of 1st segment - front_diff = max(0, min(region[0] - first_seg[1], first_seg[2] - first_seg[1])) - - # The end difference should be similar, but w/ last - end_diff = max(0, min(region[1] - end_seg[1], end_seg[2] - end_seg[1])) - - # Now, if we are moving forward, we add the difference - # to the new_start, backward, we subtract - # Also, at this point, if backward, swap start/end - if first_seg[4]: - new_start = first_seg[3] + front_diff - new_end = end_seg[3] + end_diff - else: - new_start = end_seg[3] - end_diff - new_end = first_seg[3] - front_diff - - # old_startHere, detect if we have mapped a sufficient fraction - # of the region. liftOver uses a default of 95% - mapped_size = total_mapped_sz - front_diff - (end_seg[2] - end_seg[1]) + end_diff - - if mapped_size / float(region[1] - region[0]) >= self._minFrac: - mapped_reg = (first_seg[5], new_start, new_end) - - return mapped_reg + """ + A class for lifting over genomic coordinates between assemblies. + + This class provides methods to map genomic regions from one assembly + (old_ucschg) to another (new_ucschg) using chain data stored in a database. + + Attributes: + ----------- + _db : loki_db.Database + Instance of the LOKI database used for storing chain data. + _old_ucschg : int + Version of the old assembly (e.g., 19). + _new_ucschg : int + Version of the new assembly (e.g., 38). + _cached : bool + Flag indicating whether to use cached chain data for optimization. + _minFrac : float + Minimum fraction of the region that must be mapped for successful liftOver. + + Methods: + -------- + __init__(db, old_ucschg, new_ucschg, cached=False): + Initializes a liftOver object with the provided parameters. + + _initChains(): + Initializes the cached chain data from the database. + + _findChains(chrom, start, end): + Finds chain segments that overlap with the given region. + + liftRegion(chrom, start, end): + Lifts a genomic region from old_ucschg to new_ucschg assembly. + + _mapRegion(region, first_seg, end_seg, total_mapped_sz): + Maps a region using chain segment data. + + Notes: + ------ + This class assumes chain data is stored in the LOKI database and uses + this data to perform liftOver operations between assemblies. + """ + + def __init__(self, db, old_ucschg, new_ucschg, cached=False): + """ + Initializes a liftOver object with the provided parameters. + + Parameters: + ----------- + db : loki_db.Database + Instance of the LOKI database containing chain data. + old_ucschg : int + Version of the old assembly (e.g., 19). + new_ucschg : int + Version of the new assembly (e.g., 38). + cached : bool, optional + Flag indicating whether to use cached chain data (default is False). + """ + # db is a loki_db.Database object + self._db = db + self._old_ucschg = old_ucschg + self._new_ucschg = new_ucschg + self._cached = cached + self._minFrac = 0.95 + if self._cached: + self._cached_data = {} + self._cached_keys = {} + self._chainData = self._initChains() + + def _initChains(self): + """ + Initializes the cached chain data from the database. + + This method constructs a cached representation of chain data for + optimized region mapping. 
+ """ + for row in self._db._db.cursor().execute( + "SELECT chain_id, old_chr, score, chain.old_start, " + + "chain.old_end, chain.new_start, is_fwd, new_chr, " + + "chain_data.old_start, chain_data.old_end, chain_data.new_start " + + "FROM db.chain INNER JOIN db.chain_data USING (chain_id) " + + "WHERE old_ucschg=? AND new_ucschg=?" + + "ORDER BY old_chr, score DESC, chain_data.old_start", + (self._old_ucschg, self._new_ucschg), + ): + + chain = (row[2], row[3], row[4], row[5], row[6], row[7], row[0]) + chr = row[1] + + if chr not in self._cached_data: + self._cached_data[chr] = {chain: []} + self._cached_keys[chr] = [chain] + elif chain not in self._cached_data[chr]: + self._cached_data[chr][chain] = [] + self._cached_keys[chr].append(chain) + + self._cached_data[chr][chain].append((row[8], row[9], row[10])) + + # Sort the chains by score + for k in self._cached_keys: + self._cached_keys[k].sort(reverse=True) + + def _findChains(self, chrom, start, end): + """ + Finds chain segments that overlap with the given region. + + Parameters: + ----------- + chrom : str + Chromosome name or identifier. + start : int + Start position of the region. + end : int + End position of the region. + + Yields: + ------ + tuple: + Chain segment details including chain_id, old_start, old_end, + new_start, is_fwd, new_chr. + + Notes: + ------ + This method queries the database or uses cached data to find chain + segments that overlap with the specified region. + """ + if not self._cached: + for row in self._db._db.cursor().execute( + "SELECT chain.chain_id, chain_data.old_start, chain_data.old_end, chain_data.new_start, is_fwd, new_chr " + + "FROM chain INNER JOIN chain_data ON chain.chain_id = chain_data.chain_id " + + "WHERE old_ucschg=? AND new_ucschg=? AND old_chr=? AND chain.old_end>=? AND chain.old_start=? AND chain_data.old_start= c[1]: + data = self._cached_data[chrom][c] + idx = bisect.bisect(data, (start, sys.maxsize, sys.maxsize)) + if idx: + idx = idx - 1 + + if idx < len(data) - 1 and start == data[idx + 1]: + idx = idx + 1 + + while idx < len(data) and data[idx][0] < end: + yield ( + c[-1], + data[idx][0], + data[idx][1], + data[idx][2], + c[4], + c[5], + ) + idx = idx + 1 + + def liftRegion(self, chrom, start, end): + """ + Lifts a genomic region from old_ucschg to new_ucschg assembly. + + Parameters: + ----------- + chrom : str + Chromosome name or identifier. + start : int + Start position of the region. + end : int + End position of the region. + + Returns: + -------- + tuple or None: + Mapped region (new_chr, new_start, new_end) or None if unable to map. + + Notes: + ------ + This method uses chain data to map the specified genomic region from + the old_ucschg assembly to the new_ucschg assembly. 
+ """ + # We need to actually lift regions to detect dropped sections + is_region = True + + # If the start and end are swapped, reverse them, please + if start > end: + (start, end) = (end, start) + elif start == end: + is_region = False + end = start + 1 + + ch_list = self._findChains(chrom, start, end) + + # This will be a tuple of (start, end) of the mapped region + # If the function returns "None", then it was unable to map + # the region into the new assembly + mapped_reg = None + + curr_chain = None + + total_mapped_sz = 0 + first_seg = None + end_seg = None + for seg in ch_list: + if curr_chain is None: + curr_chain = seg[0] + first_seg = seg + end_seg = seg + total_mapped_sz = seg[2] - seg[1] + elif seg[0] != curr_chain: + mapped_reg = self._mapRegion( + (start, end), first_seg, end_seg, total_mapped_sz + ) + if not mapped_reg: + first_seg = seg + end_seg = seg + total_mapped_sz = seg[2] - seg[1] + else: + break + else: + end_seg = seg + total_mapped_sz = total_mapped_sz + seg[2] - seg[1] + + if not mapped_reg and first_seg is not None: + mapped_reg = self._mapRegion( + (start, end), first_seg, end_seg, total_mapped_sz + ) + + if mapped_reg and not is_region: + mapped_reg = (mapped_reg[0], mapped_reg[1], mapped_reg[1]) # bug? + + return mapped_reg + + def _mapRegion(self, region, first_seg, end_seg, total_mapped_sz): + """ + Maps a region using chain segment data. + + Parameters: + ----------- + region : tuple + Genomic region (start, end) to map. + first_seg : tuple + First segment of the chain (chain_id, old_start, old_end, new_start, is_fwd, new_chr). + end_seg : tuple + Last segment of the chain (chain_id, old_start, old_end, new_start, is_fwd, new_chr). + total_mapped_sz : int + Total size of mapped segments. + + Returns: + -------- + tuple or None: + Mapped region (new_chr, new_start, new_end) or None if unable to map. + + Notes: + ------ + This method calculates the mapped region based on the chain segments + and verifies if the mapped fraction meets the minimum required. + """ + mapped_reg = None + + # The front and end differences are the distances from the + # beginning of the segment. + + # The front difference should be >= 0 and <= size of 1st segment + front_diff = max(0, min(region[0] - first_seg[1], first_seg[2] - first_seg[1])) + + # The end difference should be similar, but w/ last + end_diff = max(0, min(region[1] - end_seg[1], end_seg[2] - end_seg[1])) + + # Now, if we are moving forward, we add the difference + # to the new_start, backward, we subtract + # Also, at this point, if backward, swap start/end + if first_seg[4]: + new_start = first_seg[3] + front_diff + new_end = end_seg[3] + end_diff + else: + new_start = end_seg[3] - end_diff + new_end = first_seg[3] - front_diff + + # old_startHere, detect if we have mapped a sufficient fraction + # of the region. 
liftOver uses a default of 95% + mapped_size = ( + total_mapped_sz - front_diff - (end_seg[2] - end_seg[1]) + end_diff + ) + + if mapped_size / float(region[1] - region[0]) >= self._minFrac: + mapped_reg = (first_seg[5], new_start, new_end) + + return mapped_reg + if __name__ == "__main__": - from loki import loki_db - - if len(sys.argv) < 5: - print("usage: %s [oldhg=19] [newhg=38]" % (sys.argv[0],)) - sys.exit(2) - - db = loki_db.Database(sys.argv[2]) - - old = int(sys.argv[5]) if (len(sys.argv) > 5) else 19 - new = int(sys.argv[6]) if (len(sys.argv) > 6) else 38 - #lo = liftOver(db, old, new, False) - f = (sys.stdin if (sys.argv[1] == '-') else open(sys.argv[1],'r')) - m = (sys.stdout if (sys.argv[3] == '-') else open(sys.argv[3],'w')) - u = (sys.stderr if (sys.argv[4] == '-') else open(sys.argv[4],'w')) - - def generateInputs(f): - """ - Generates input data for liftOver region conversion. - - Parameters: - ----------- - f : file object - Input file object containing genomic coordinates. - - Yields: - ------ - tuple: - Tuple containing processed genomic region information: - (formatted_line, chromosome_number, start_position, end_position, None). - - Notes: - ------ - This function reads lines from the input file object 'f', processes - genomic coordinates, replaces spaces and tabs with colons, adjusts - chromosome names, and retrieves chromosome numbers from 'db'. - """ - for l in f: - wds = l.split() - if wds[0].lower().startswith('chr'): - wds[0] = wds[0][3:] - yield (l.strip().replace(" ",":").replace("\t",":"), db.chr_num.get(wds[0],-1), int(wds[1]), int(wds[2]), None) - - def errorCallback(r): - """ - Error callback function for handling liftOver errors. - - Parameters: - ----------- - r : tuple - Tuple containing error details to be processed. - - Notes: - ------ - This function prints the error details to the stderr stream 'u' - in a tab-separated format. - """ - print("\t".join(str(c) for c in r), end="", file=u) - - for r in db.generateLiftOverRegions(old, new, generateInputs(f), errorCallback=errorCallback): - print("chr%s\t%s\t%d\t%d" % (db.chr_name.get(r[1],r[1]), r[0], r[2], r[3]), end="", file=m) + from loki import loki_db + + if len(sys.argv) < 5: + print( + "usage: %s [oldhg=19] [newhg=38]" + % (sys.argv[0],) + ) + sys.exit(2) + + db = loki_db.Database(sys.argv[2]) + + old = int(sys.argv[5]) if (len(sys.argv) > 5) else 19 + new = int(sys.argv[6]) if (len(sys.argv) > 6) else 38 + # lo = liftOver(db, old, new, False) + f = sys.stdin if (sys.argv[1] == "-") else open(sys.argv[1], "r") + m = sys.stdout if (sys.argv[3] == "-") else open(sys.argv[3], "w") + u = sys.stderr if (sys.argv[4] == "-") else open(sys.argv[4], "w") + + def generateInputs(f): + """ + Generates input data for liftOver region conversion. + + Parameters: + ----------- + f : file object + Input file object containing genomic coordinates. + + Yields: + ------ + tuple: + Tuple containing processed genomic region information: + (formatted_line, chromosome_number, start_position, end_position, None). + + Notes: + ------ + This function reads lines from the input file object 'f', processes + genomic coordinates, replaces spaces and tabs with colons, adjusts + chromosome names, and retrieves chromosome numbers from 'db'. 
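
To make the fraction test in _mapRegion above concrete, here is a small self-contained rework of its arithmetic with invented numbers (not part of the patch): a single forward-strand chain segment covers old positions 100-300 and begins at 5100 on the new assembly, and we lift the region (150, 250).

# worked example of the mapped-fraction arithmetic (illustrative values only)
region = (150, 250)
# (chain_id, old_start, old_end, new_start, is_fwd, new_chr), as yielded by _findChains
first_seg = end_seg = ("chain1", 100, 300, 5100, True, "1")
total_mapped_sz = first_seg[2] - first_seg[1]  # 200, as accumulated in liftRegion()

front_diff = max(0, min(region[0] - first_seg[1], first_seg[2] - first_seg[1]))  # 50
end_diff = max(0, min(region[1] - end_seg[1], end_seg[2] - end_seg[1]))          # 150

new_start = first_seg[3] + front_diff  # 5150 (forward strand: add the offsets)
new_end = end_seg[3] + end_diff        # 5250

mapped_size = total_mapped_sz - front_diff - (end_seg[2] - end_seg[1]) + end_diff  # 100
fraction = mapped_size / float(region[1] - region[0])                              # 1.0

assert fraction >= 0.95  # the _minFrac default, so the lift succeeds
print((end_seg[5], new_start, new_end))  # ('1', 5150, 5250)
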
+ """ + for l in f: + wds = l.split() + if wds[0].lower().startswith("chr"): + wds[0] = wds[0][3:] + yield ( + l.strip().replace(" ", ":").replace("\t", ":"), + db.chr_num.get(wds[0], -1), + int(wds[1]), + int(wds[2]), + None, + ) + + def errorCallback(r): + """ + Error callback function for handling liftOver errors. + + Parameters: + ----------- + r : tuple + Tuple containing error details to be processed. + + Notes: + ------ + This function prints the error details to the stderr stream 'u' + in a tab-separated format. + """ + print("\t".join(str(c) for c in r), end="", file=u) + + for r in db.generateLiftOverRegions( + old, new, generateInputs(f), errorCallback=errorCallback + ): + print( + "chr%s\t%s\t%d\t%d" % (db.chr_name.get(r[1], r[1]), r[0], r[2], r[3]), + end="", + file=m, + ) diff --git a/setup.py b/setup.py index a2c6999..941d097 100644 --- a/setup.py +++ b/setup.py @@ -20,21 +20,12 @@ import setuptools setuptools.setup( - name='LOKI', - version='3.0.0', - author='Ritchie Lab', - author_email='Software_RitchieLab@pennmedicine.upenn.edu', - url='https://ritchielab.org', - scripts=[ - 'loki-build.py' - ], - packages=[ - 'loki', - 'loki.loaders', - 'loki.loaders.test', - 'loki.util' - ], - data_files=[ - ('', ['CHANGELOG']) - ] + name="LOKI", + version="3.0.0", + author="Ritchie Lab", + author_email="Software_RitchieLab@pennmedicine.upenn.edu", + url="https://ritchielab.org", + scripts=["loki-build.py"], + packages=["loki", "loki.loaders", "loki.loaders.test", "loki.util"], + data_files=[("", ["CHANGELOG"])], ) diff --git a/unsupported_loaders/loki_source_disgenet.py b/unsupported_loaders/loki_source_disgenet.py index 5a86f78..0ac5110 100644 --- a/unsupported_loaders/loki_source_disgenet.py +++ b/unsupported_loaders/loki_source_disgenet.py @@ -8,96 +8,125 @@ class Source_disgenet(loki_source.Source): - - - @classmethod - def getVersionString(cls): - return '1.0 (2023-08-08)' - #getVersionString() - - - def download(self, options): - # download the latest source files - self.downloadFilesFromHTTP('disgenet.org', { - 'disgenet_2020.db.gz': '/static/disgenet_ap1/files/sqlite_downloads/current/disgenet_2020.db.gz', - }) - - return ['disgenet_2020.db.gz'] - #download() - - - def update(self, options): - # clear out all old data from this source - self.log("deleting old records from the database ...") - self.deleteAll() - self.log(" OK\n") - - # get or create the required metadata records - namespaceID = self.addNamespaces([ - ('disgenet_id', 0), - ('entrez_gid', 0), - ('disease', 0) - ]) - typeID = self.addTypes([ - ('disease',), - ('gene',), - ]) - subtypeID = self.addSubtypes([ - ('-',), - ]) - - # process disgenet sqlite file - self.log("processing diseases ...") - gunzip('disgenet_2020.db.gz') - diseases = {} - diseaseClass = {} - con = apsw.Connection('disgenet_2020.db') - cur = con.cursor() - comm = 'select diseaseClassNID,diseaseClassName from diseaseClass' - cur.execute(comm) - diseaseClass = {diseaseclass[0]:diseaseclass[1].strip() for diseaseclass in cur.fetchall()} - comm = 'SELECT a.diseaseId,a.diseaseName,b.diseaseClassNID FROM diseaseAttributes a LEFT JOIN disease2class b ON a.diseaseNID=b.diseaseNID order by a.diseaseNID' - cur.execute(comm) - diseases = {disease[0]:[disease[1],disease[2]] for disease in cur.fetchall()} - #foreach line in diseaseFile - self.log(" OK: %d disease\n" % (len(diseases),)) - - # store diseases - self.log("writing diseases to the database ...") - listSubtype = self.addSubtypes([(val,)for val in set(diseaseClass.values())]) - listGroup = 
diseases.keys() - listAID = self.addTypedGroups(typeID['disease'], ((subtypeID['-'] if diseases[diseaseID][1] is None else listSubtype[diseaseClass[diseases[diseaseID][1]]],diseases[diseaseID][0],None) for diseaseID in listGroup)) - groupAID = dict(zip(listGroup,listAID)) - self.log(" OK\n") - - # store diseases names - self.log("writing diseases names to the database ...") - self.addGroupNamespacedNames(namespaceID['disgenet_id'], ((groupAID[diseaseID],diseaseID) for diseaseID in listGroup)) - self.addGroupNamespacedNames(namespaceID['disease'], ((groupAID[diseaseID],diseases[diseaseID][0]) for diseaseID in listGroup)) - diseases = None - diseaseClass = None - self.log(" OK\n") - - # process disgenet disease identifiers - self.log("processing diseases identifiers ...") - diseaseGene = set() - comm = 'SELECT b.geneId,c.diseaseId FROM geneDiseaseNetwork a LEFT JOIN geneAttributes b ON a.geneNID=b.geneNID LEFT JOIN diseaseAttributes c ON a.diseaseNID=c.diseaseNID ORDER BY c.diseaseId' - cur.execute(comm) - diseaseGeneResult = cur.fetchall() - con.close() - numAssoc = 0 - for pair in diseaseGeneResult: - if pair[1] in listGroup: - numAssoc += 1 - diseaseGene.add( (groupAID[pair[1]],numAssoc,pair[0]) ) - self.log(" OK: %d diseases and gene pairs\n" % (len(diseaseGene),)) - - # store gaad disease identifiers - self.log("writing diseases and gene pairs to the database ...") - self.addGroupMemberTypedNamespacedNames(typeID['gene'], namespaceID['entrez_gid'], diseaseGene) - diseaseGene = None - self.log(" OK\n") - - #update() - -#Source_go + + @classmethod + def getVersionString(cls): + return "1.0 (2023-08-08)" + + # getVersionString() + + def download(self, options): + # download the latest source files + self.downloadFilesFromHTTP( + "disgenet.org", + { + "disgenet_2020.db.gz": "/static/disgenet_ap1/files/sqlite_downloads/current/disgenet_2020.db.gz", + }, + ) + + return ["disgenet_2020.db.gz"] + + # download() + + def update(self, options): + # clear out all old data from this source + self.log("deleting old records from the database ...") + self.deleteAll() + self.log(" OK\n") + + # get or create the required metadata records + namespaceID = self.addNamespaces( + [("disgenet_id", 0), ("entrez_gid", 0), ("disease", 0)] + ) + typeID = self.addTypes( + [ + ("disease",), + ("gene",), + ] + ) + subtypeID = self.addSubtypes( + [ + ("-",), + ] + ) + + # process disgenet sqlite file + self.log("processing diseases ...") + gunzip("disgenet_2020.db.gz") + diseases = {} + diseaseClass = {} + con = apsw.Connection("disgenet_2020.db") + cur = con.cursor() + comm = "select diseaseClassNID,diseaseClassName from diseaseClass" + cur.execute(comm) + diseaseClass = { + diseaseclass[0]: diseaseclass[1].strip() for diseaseclass in cur.fetchall() + } + comm = "SELECT a.diseaseId,a.diseaseName,b.diseaseClassNID FROM diseaseAttributes a LEFT JOIN disease2class b ON a.diseaseNID=b.diseaseNID order by a.diseaseNID" + cur.execute(comm) + diseases = {disease[0]: [disease[1], disease[2]] for disease in cur.fetchall()} + # foreach line in diseaseFile + self.log(" OK: %d disease\n" % (len(diseases),)) + + # store diseases + self.log("writing diseases to the database ...") + listSubtype = self.addSubtypes([(val,) for val in set(diseaseClass.values())]) + listGroup = diseases.keys() + listAID = self.addTypedGroups( + typeID["disease"], + ( + ( + ( + subtypeID["-"] + if diseases[diseaseID][1] is None + else listSubtype[diseaseClass[diseases[diseaseID][1]]] + ), + diseases[diseaseID][0], + None, + ) + for diseaseID in 
listGroup + ), + ) + groupAID = dict(zip(listGroup, listAID)) + self.log(" OK\n") + + # store diseases names + self.log("writing diseases names to the database ...") + self.addGroupNamespacedNames( + namespaceID["disgenet_id"], + ((groupAID[diseaseID], diseaseID) for diseaseID in listGroup), + ) + self.addGroupNamespacedNames( + namespaceID["disease"], + ((groupAID[diseaseID], diseases[diseaseID][0]) for diseaseID in listGroup), + ) + diseases = None + diseaseClass = None + self.log(" OK\n") + + # process disgenet disease identifiers + self.log("processing diseases identifiers ...") + diseaseGene = set() + comm = "SELECT b.geneId,c.diseaseId FROM geneDiseaseNetwork a LEFT JOIN geneAttributes b ON a.geneNID=b.geneNID LEFT JOIN diseaseAttributes c ON a.diseaseNID=c.diseaseNID ORDER BY c.diseaseId" + cur.execute(comm) + diseaseGeneResult = cur.fetchall() + con.close() + numAssoc = 0 + for pair in diseaseGeneResult: + if pair[1] in listGroup: + numAssoc += 1 + diseaseGene.add((groupAID[pair[1]], numAssoc, pair[0])) + self.log(" OK: %d diseases and gene pairs\n" % (len(diseaseGene),)) + + # store gaad disease identifiers + self.log("writing diseases and gene pairs to the database ...") + self.addGroupMemberTypedNamespacedNames( + typeID["gene"], namespaceID["entrez_gid"], diseaseGene + ) + diseaseGene = None + self.log(" OK\n") + + # update() + + +# Source_go diff --git a/unsupported_loaders/loki_source_gaad.py b/unsupported_loaders/loki_source_gaad.py index b3f9d90..211dd0d 100644 --- a/unsupported_loaders/loki_source_gaad.py +++ b/unsupported_loaders/loki_source_gaad.py @@ -6,151 +6,174 @@ class Source_gaad(loki_source.Source): - - - @classmethod - def getVersionString(cls): - return '1.0 (2023-06-08)' - #getVersionString() - - - def download(self, options): - # download the latest source files - self.downloadFilesFromHTTPS('gaad.medgenius.info', { - 'diseases2.txt.gz': '/Downloads/diseases2.txt.gz', # disease name by AID - 'disease_relationships.txt.gz': '/Downloads/disease_relationships.txt.gz', - 'disease_association_database_annotations_uniprot_ncbiGene.txt.gz': '/Downloads/disease_association_database_annotations_uniprot_ncbiGene.txt.gz', - 'disease_association_genecards.txt.gz': '/Downloads/disease_association_genecards.txt.gz', - 'disease_gene_association_pubmed_textmining_zhao.txt.gz': '/Downloads/disease_gene_association_pubmed_textmining_zhao.txt.gz', - }) - - return [ - 'diseases2.txt.gz', - 'disease_relationships.txt.gz', - 'disease_association_database_annotations_uniprot_ncbiGene.txt.gz', - 'disease_association_genecards.txt.gz', - 'disease_gene_association_pubmed_textmining_zhao.txt.gz' - ] - #download() - - - def update(self, options): - # clear out all old data from this source - self.log("deleting old records from the database ...") - self.deleteAll() - self.log(" OK\n") - - # get or create the required metadata records - namespaceID = self.addNamespaces([ - ('gaad_id', 0), - ('entrez_gid', 0), - ('disease', 0) - ]) - relationshipID = self.addRelationships([ - ('disease_co-occurring',), - ]) - typeID = self.addTypes([ - ('disease',), - ('gene',), - ]) - subtypeID = self.addSubtypes([ - ('-',), - ]) - - # process gaad disease - self.log("processing diseases ...") - diseaseFile = self.zfile('diseases2.txt.gz') - diseases = {} - for line in diseaseFile: - if not line.startswith("AID"): - continue - words = line.split("\t") - diseaseID = words[0] - name = words[1].rstrip() - # store disease name of each disease ID (AID) - diseases[diseaseID] = name - #foreach line in diseaseFile 
- self.log(" OK: %d disease\n" % (len(diseases),)) - - # store diseases - self.log("writing diseases to the database ...") - listGroup = diseases.keys() - listAID = self.addTypedGroups(typeID['disease'], ((subtypeID['-'],group,diseases[group]) for group in listGroup)) - groupAID = dict(zip(listGroup,listAID)) - self.log(" OK\n") - - # store diseases names - self.log("writing diseases names to the database ...") - self.addGroupNamespacedNames(namespaceID['gaad_id'], ((groupAID[group],group) for group in listGroup)) - self.addGroupNamespacedNames(namespaceID['disease'], ((groupAID[group],diseases[group]) for group in listGroup)) - diseases = None - self.log(" OK\n") - - # process gaad disease relationships - self.log("processing diseases relationships ...") - relationshipFile = self.zfile('disease_relationships.txt.gz') - relationships = [] - num = 0 - for line in relationshipFile: - if line.startswith("disease_uid1"): - continue - words = line.split("\t") - diseaseID = words[0] - diseaseID2 = words[1] - # store disease pairs that shares genes - relationships.append( (diseaseID,diseaseID2,relationshipID['disease_co-occurring'],None) ) - num+=1 - #foreach line in diseaseFile - self.log(" OK: %d disease relationships\n" % (num,)) - - # store gaad disease relationships - self.log("writing diseases relationships to the database ...") - self.addGroupRelationships(relationships) - relationships = None - self.log(" OK\n") - - # process gaad disease identifiers - self.log("processing diseases identifiers ...") - ncbiFile = self.zfile('disease_association_database_annotations_uniprot_ncbiGene.txt.gz') - genecardsFile = self.zfile('disease_association_genecards.txt.gz') - pubmedFile = self.zfile('disease_gene_association_pubmed_textmining_zhao.txt.gz') - diseaseGene = [] - num = 0 - for line in ncbiFile: - if line.startswith("disease_"): - continue - words = line.split("\t") - diseaseID = words[0].strip() - entrezID = words[1].strip() - num+=1 - diseaseGene.append((groupAID[diseaseID], num, entrezID)) - #foreach line in ncbiFile: - for line in genecardsFile: - if line.startswith("disease_"): - continue - words = line.split("\t") - diseaseID = words[0].strip() - entrezID = words[1].strip() - num+=1 - diseaseGene.append((groupAID[diseaseID], num, entrezID)) - #foreach line in genecardsFile: - for line in pubmedFile: - if line.startswith("disease_"): - continue - words = line.split("\t") - diseaseID = words[2].strip() - entrezID = words[1].strip() - num+=1 - diseaseGene.append((groupAID[diseaseID], num, entrezID)) - #foreach line in pubmedFile: - self.log(" OK: %d diseases and gene pairs\n" % (len(diseaseGene),)) - - # store gaad disease identifiers - self.log("writing diseases and gene pairs to the database ...") - self.addGroupMemberTypedNamespacedNames(typeID['gene'], namespaceID['entrez_gid'], diseaseGene) - diseaseGene = None - self.log(" OK\n") - - #update() - -#Source_go + + @classmethod + def getVersionString(cls): + return "1.0 (2023-06-08)" + + # getVersionString() + + def download(self, options): + # download the latest source files + self.downloadFilesFromHTTPS( + "gaad.medgenius.info", + { + "diseases2.txt.gz": "/Downloads/diseases2.txt.gz", # disease name by AID + "disease_relationships.txt.gz": "/Downloads/disease_relationships.txt.gz", + "disease_association_database_annotations_uniprot_ncbiGene.txt.gz": "/Downloads/disease_association_database_annotations_uniprot_ncbiGene.txt.gz", + "disease_association_genecards.txt.gz": "/Downloads/disease_association_genecards.txt.gz", + 
"disease_gene_association_pubmed_textmining_zhao.txt.gz": "/Downloads/disease_gene_association_pubmed_textmining_zhao.txt.gz", + }, + ) + + return [ + "diseases2.txt.gz", + "disease_relationships.txt.gz", + "disease_association_database_annotations_uniprot_ncbiGene.txt.gz", + "disease_association_genecards.txt.gz", + "disease_gene_association_pubmed_textmining_zhao.txt.gz", + ] + + # download() + + def update(self, options): + # clear out all old data from this source + self.log("deleting old records from the database ...") + self.deleteAll() + self.log(" OK\n") + + # get or create the required metadata records + namespaceID = self.addNamespaces( + [("gaad_id", 0), ("entrez_gid", 0), ("disease", 0)] + ) + relationshipID = self.addRelationships( + [ + ("disease_co-occurring",), + ] + ) + typeID = self.addTypes( + [ + ("disease",), + ("gene",), + ] + ) + subtypeID = self.addSubtypes( + [ + ("-",), + ] + ) + + # process gaad disease + self.log("processing diseases ...") + diseaseFile = self.zfile("diseases2.txt.gz") + diseases = {} + for line in diseaseFile: + if not line.startswith("AID"): + continue + words = line.split("\t") + diseaseID = words[0] + name = words[1].rstrip() + # store disease name of each disease ID (AID) + diseases[diseaseID] = name + # foreach line in diseaseFile + self.log(" OK: %d disease\n" % (len(diseases),)) + + # store diseases + self.log("writing diseases to the database ...") + listGroup = diseases.keys() + listAID = self.addTypedGroups( + typeID["disease"], + ((subtypeID["-"], group, diseases[group]) for group in listGroup), + ) + groupAID = dict(zip(listGroup, listAID)) + self.log(" OK\n") + + # store diseases names + self.log("writing diseases names to the database ...") + self.addGroupNamespacedNames( + namespaceID["gaad_id"], ((groupAID[group], group) for group in listGroup) + ) + self.addGroupNamespacedNames( + namespaceID["disease"], + ((groupAID[group], diseases[group]) for group in listGroup), + ) + diseases = None + self.log(" OK\n") + + # process gaad disease relationships + self.log("processing diseases relationships ...") + relationshipFile = self.zfile("disease_relationships.txt.gz") + relationships = [] + num = 0 + for line in relationshipFile: + if line.startswith("disease_uid1"): + continue + words = line.split("\t") + diseaseID = words[0] + diseaseID2 = words[1] + # store disease pairs that shares genes + relationships.append( + (diseaseID, diseaseID2, relationshipID["disease_co-occurring"], None) + ) + num += 1 + # foreach line in diseaseFile + self.log(" OK: %d disease relationships\n" % (num,)) + + # store gaad disease relationships + self.log("writing diseases relationships to the database ...") + self.addGroupRelationships(relationships) + relationships = None + self.log(" OK\n") + + # process gaad disease identifiers + self.log("processing diseases identifiers ...") + ncbiFile = self.zfile( + "disease_association_database_annotations_uniprot_ncbiGene.txt.gz" + ) + genecardsFile = self.zfile("disease_association_genecards.txt.gz") + pubmedFile = self.zfile( + "disease_gene_association_pubmed_textmining_zhao.txt.gz" + ) + diseaseGene = [] + num = 0 + for line in ncbiFile: + if line.startswith("disease_"): + continue + words = line.split("\t") + diseaseID = words[0].strip() + entrezID = words[1].strip() + num += 1 + diseaseGene.append((groupAID[diseaseID], num, entrezID)) + # foreach line in ncbiFile: + for line in genecardsFile: + if line.startswith("disease_"): + continue + words = line.split("\t") + diseaseID = words[0].strip() + entrezID = 
words[1].strip() + num += 1 + diseaseGene.append((groupAID[diseaseID], num, entrezID)) + # foreach line in genecardsFile: + for line in pubmedFile: + if line.startswith("disease_"): + continue + words = line.split("\t") + diseaseID = words[2].strip() + entrezID = words[1].strip() + num += 1 + diseaseGene.append((groupAID[diseaseID], num, entrezID)) + # foreach line in pubmedFile: + self.log(" OK: %d diseases and gene pairs\n" % (len(diseaseGene),)) + + # store gaad disease identifiers + self.log("writing diseases and gene pairs to the database ...") + self.addGroupMemberTypedNamespacedNames( + typeID["gene"], namespaceID["entrez_gid"], diseaseGene + ) + diseaseGene = None + self.log(" OK\n") + + # update() + + +# Source_go diff --git a/unsupported_loaders/loki_source_kegg.py b/unsupported_loaders/loki_source_kegg.py index 1d3f458..2fe637a 100644 --- a/unsupported_loaders/loki_source_kegg.py +++ b/unsupported_loaders/loki_source_kegg.py @@ -6,229 +6,268 @@ class Source_kegg(loki_source.Source): - - - @classmethod - def getVersionString(cls): - return '2.0 (2013-02-14)' - #getVersionString() - - - @classmethod - def getOptions(cls): - return { - 'api': '[rest|cache] -- use the new REST API, or a local file cache (default: rest)' - } - #getOptions() - - - def validateOptions(self, options): - for o,v in options.items(): - if o == 'api': - v = v.strip().lower() - if 'rest'.startswith(v): - v = 'rest' - elif 'cache'.startswith(v): - v = 'cache' - else: - return "api must be 'rest', or 'cache'" - options[o] = v - else: - return "unexpected option '%s'" % o - return True - #validateOptions() - - - def download(self, options): - if (options.get('api') == 'cache'): - # do nothing, update() will just expect the files to already be there - pass - else: # api==rest - self.downloadFilesFromHTTP('rest.kegg.jp', { - 'list-pathway-hsa': '/list/pathway/hsa', - 'link-pathway-hsa': '/link/pathway/hsa', - 'list-disease': '/list/disease', - 'link-disease-hsa': '/link/disease/hsa', - 'category-pathway': '/get/br:br08901/json', - 'category-disease': '/get/br:br08403/json', - }) - #if api==rest/cache - - return [ - 'list-pathway-hsa', - 'link-pathway-hsa', - 'list-disease', - 'link-disease-hsa', - 'category-pathway', - 'category-disease' - ] - #download() - - - def update(self, options): - # clear out all old data from this source - self.log("deleting old records from the database ...") - self.deleteAll() - self.log(" OK\n") - - # get or create the required metadata records - namespaceID = self.addNamespaces([ - ('kegg_id', 0), - ('pathway', 0), - ('entrez_gid', 0), - ('disease', 0) - ]) - typeID = self.addTypes([ - ('pathway',), - ('gene',), - ('disease',), - ]) - - # process pathways - self.log("processing pathways ...") - #read pathway categories json file into pathCategory - pathCategory = [] - with open(r'category-pathway') as pathCategoryFile: - pathCategory = json.load(pathCategoryFile) - #store subtypes into pathSubtype - pathSubtype = {} - for category in pathCategory['children']: - for category2 in category['children']: - if category2['name']=='Global and overview maps' or category2['name']=='Carbohydrate metabolism' or category2['name']=='Energy metabolism' or category2['name']=='Immune system' or category2['name']=='Endocrine system': - continue - for category3 in category2['children']: - line = category3['name'].split(" ") - pathID = "hsa"+line[0] - pathSubtype[pathID] = category2['name'] - pathCategory = None - #with pathCategory - pathName = {} - with open('list-pathway-hsa','r') as pathFile: - for 
line in pathFile: - words = line.split("\t") - pathID = words[0] - if pathID not in pathSubtype: - pathSubtype[pathID] = "-" - name = words[1].rstrip() - if name.endswith(" - Homo sapiens (human)"): - name = name[:-23] - pathName[pathID] = name - #foreach line in pathFile - #with pathFile - self.log(" OK: %d pathways\n" % (len(pathName),)) - - # store pathways - self.log("writing pathways to the database ...") - listPath = pathName.keys() - listSubtype = self.addSubtypes([(val,)for val in set(pathSubtype.values())]) - listGID = self.addTypedGroups(typeID['pathway'], ((listSubtype[pathSubtype[pathID]],pathName[pathID],None) for pathID in listPath)) - pathGID = dict(zip(listPath,listGID)) - self.log(" OK\n") - - # store pathway names - self.log("writing pathway names to the database ...") - self.addGroupNamespacedNames(namespaceID['kegg_id'], ((pathGID[pathID],pathID) for pathID in listPath)) - self.addGroupNamespacedNames(namespaceID['pathway'], ((pathGID[pathID],pathName[pathID]) for pathID in listPath)) - self.log(" OK\n") - pathName = None - listPath = None - - # process associations - self.log("processing pathway gene associations ...") - entrezAssoc = set() - numAssoc = 0 - with open('link-pathway-hsa','r') as assocFile: - for line in assocFile: - words = line.split("\t") - hsaGene = words[0] - pathID = words[1].strip().replace("path:hsa","hsa") - if pathID in pathGID: - numAssoc += 1 - entrezAssoc.add( (pathGID[pathID],numAssoc,hsaGene[4:]) ) - #if pathway and gene are ok - #foreach line in assocFile - #with assocFile - self.log(" OK: %d associations\n" % (numAssoc,)) - listSubtype = None - pathGID = None - - # store gene associations - self.log("writing gene associations to the database ...") - self.addGroupMemberTypedNamespacedNames(typeID['gene'], namespaceID['entrez_gid'], entrezAssoc) - self.log(" OK\n") - entrezAssoc = None - - # process diseases - self.log("processing diseases ...") - #read disease categories json file into diseaseCategory - diseaseCategory = [] - with open(r'category-disease') as diseaseCategoryFile: - diseaseCategory = json.load(diseaseCategoryFile) - #store subtypes into diseaseSubtype - diseaseSubtype = {} - for category in diseaseCategory['children']: - for category2 in category['children']: - if 'children' not in category2: - continue - for category3 in category2['children']: - if 'children' not in category3: - continue - for category4 in category3['children']: - line = category4['name'] - if not line.startswith("H"): - continue; - diseaseID = line.split(" ")[0] - diseaseSubtype[diseaseID] = category2['name'] - diseaseCategory = None - #with diseaseCategory - diseaseName = {} - with open('list-disease','r') as pathFile: - for line in pathFile: - words = line.split("\t") - pathID = words[0] - if pathID not in diseaseSubtype: - diseaseSubtype[pathID] = "-" - name = words[1].rstrip() - diseaseName[pathID] = name - #foreach line in diseaseFile - #with diseaseFile - self.log(" OK: %d diseases\n" % (len(diseaseName),)) - - # store diseases - self.log("writing diseases to the database ...") - listDisease = diseaseName.keys() - listSubtype = self.addSubtypes([(val,)for val in set(diseaseSubtype.values())]) - listGID = self.addTypedGroups(typeID['disease'], ((listSubtype[diseaseSubtype[diseaseID]],diseaseName[diseaseID],None) for diseaseID in listDisease)) - diseaseGID = dict(zip(listDisease,listGID)) - self.log(" OK\n") - - # store disease names - self.log("writing disease names to the database ...") - self.addGroupNamespacedNames(namespaceID['kegg_id'], 
((diseaseGID[diseaseID],diseaseID) for diseaseID in listDisease)) - self.addGroupNamespacedNames(namespaceID['disease'], ((diseaseGID[diseaseID],diseaseName[diseaseID]) for diseaseID in listDisease)) - self.log(" OK\n") - - # process disease & gene associations - self.log("processing disease gene associations ...") - entrezAssoc = set() - numAssoc = 0 - with open('link-disease-hsa','r') as assocFile: - for line in assocFile: - words = line.split("\t") - hsaGene = words[0] - diseaseID = words[1].strip()[3:] - if diseaseID in diseaseGID: - numAssoc += 1 - entrezAssoc.add( (diseaseGID[diseaseID],numAssoc,hsaGene[4:]) ) - #foreach line in assocFile - #with assocFile - self.log(" OK: %d associations\n" % (numAssoc,)) - - # store gene associations - self.log("writing gene associations to the database ...") - self.addGroupMemberTypedNamespacedNames(typeID['gene'], namespaceID['entrez_gid'], entrezAssoc) - self.log(" OK\n") - entrezAssoc = None - #update() - -#Source_kegg + + @classmethod + def getVersionString(cls): + return "2.0 (2013-02-14)" + + # getVersionString() + + @classmethod + def getOptions(cls): + return { + "api": "[rest|cache] -- use the new REST API, or a local file cache (default: rest)" + } + + # getOptions() + + def validateOptions(self, options): + for o, v in options.items(): + if o == "api": + v = v.strip().lower() + if "rest".startswith(v): + v = "rest" + elif "cache".startswith(v): + v = "cache" + else: + return "api must be 'rest', or 'cache'" + options[o] = v + else: + return "unexpected option '%s'" % o + return True + + # validateOptions() + + def download(self, options): + if options.get("api") == "cache": + # do nothing, update() will just expect the files to already be there + pass + else: # api==rest + self.downloadFilesFromHTTP( + "rest.kegg.jp", + { + "list-pathway-hsa": "/list/pathway/hsa", + "link-pathway-hsa": "/link/pathway/hsa", + "list-disease": "/list/disease", + "link-disease-hsa": "/link/disease/hsa", + "category-pathway": "/get/br:br08901/json", + "category-disease": "/get/br:br08403/json", + }, + ) + # if api==rest/cache + + return [ + "list-pathway-hsa", + "link-pathway-hsa", + "list-disease", + "link-disease-hsa", + "category-pathway", + "category-disease", + ] + + # download() + + def update(self, options): + # clear out all old data from this source + self.log("deleting old records from the database ...") + self.deleteAll() + self.log(" OK\n") + + # get or create the required metadata records + namespaceID = self.addNamespaces( + [("kegg_id", 0), ("pathway", 0), ("entrez_gid", 0), ("disease", 0)] + ) + typeID = self.addTypes( + [ + ("pathway",), + ("gene",), + ("disease",), + ] + ) + + # process pathways + self.log("processing pathways ...") + # read pathway categories json file into pathCategory + pathCategory = [] + with open(r"category-pathway") as pathCategoryFile: + pathCategory = json.load(pathCategoryFile) + # store subtypes into pathSubtype + pathSubtype = {} + for category in pathCategory["children"]: + for category2 in category["children"]: + if ( + category2["name"] == "Global and overview maps" + or category2["name"] == "Carbohydrate metabolism" + or category2["name"] == "Energy metabolism" + or category2["name"] == "Immune system" + or category2["name"] == "Endocrine system" + ): + continue + for category3 in category2["children"]: + line = category3["name"].split(" ") + pathID = "hsa" + line[0] + pathSubtype[pathID] = category2["name"] + pathCategory = None + # with pathCategory + pathName = {} + with open("list-pathway-hsa", "r") as 
pathFile:
+            for line in pathFile:
+                words = line.split("\t")
+                pathID = words[0]
+                if pathID not in pathSubtype:
+                    pathSubtype[pathID] = "-"
+                name = words[1].rstrip()
+                if name.endswith(" - Homo sapiens (human)"):
+                    name = name[:-23]
+                pathName[pathID] = name
+            # foreach line in pathFile
+        # with pathFile
+        self.log(" OK: %d pathways\n" % (len(pathName),))
+
+        # store pathways
+        self.log("writing pathways to the database ...")
+        listPath = pathName.keys()
+        listSubtype = self.addSubtypes([(val,) for val in set(pathSubtype.values())])
+        listGID = self.addTypedGroups(
+            typeID["pathway"],
+            (
+                (listSubtype[pathSubtype[pathID]], pathName[pathID], None)
+                for pathID in listPath
+            ),
+        )
+        pathGID = dict(zip(listPath, listGID))
+        self.log(" OK\n")
+
+        # store pathway names
+        self.log("writing pathway names to the database ...")
+        self.addGroupNamespacedNames(
+            namespaceID["kegg_id"], ((pathGID[pathID], pathID) for pathID in listPath)
+        )
+        self.addGroupNamespacedNames(
+            namespaceID["pathway"],
+            ((pathGID[pathID], pathName[pathID]) for pathID in listPath),
+        )
+        self.log(" OK\n")
+        pathName = None
+        listPath = None
+
+        # process associations
+        self.log("processing pathway gene associations ...")
+        entrezAssoc = set()
+        numAssoc = 0
+        with open("link-pathway-hsa", "r") as assocFile:
+            for line in assocFile:
+                words = line.split("\t")
+                hsaGene = words[0]
+                pathID = words[1].strip().replace("path:hsa", "hsa")
+                if pathID in pathGID:
+                    numAssoc += 1
+                    entrezAssoc.add((pathGID[pathID], numAssoc, hsaGene[4:]))
+                # if pathway and gene are ok
+            # foreach line in assocFile
+        # with assocFile
+        self.log(" OK: %d associations\n" % (numAssoc,))
+        listSubtype = None
+        pathGID = None
+
+        # store gene associations
+        self.log("writing gene associations to the database ...")
+        self.addGroupMemberTypedNamespacedNames(
+            typeID["gene"], namespaceID["entrez_gid"], entrezAssoc
+        )
+        self.log(" OK\n")
+        entrezAssoc = None
+
+        # process diseases
+        self.log("processing diseases ...")
+        # read disease categories json file into diseaseCategory
+        diseaseCategory = []
+        with open(r"category-disease") as diseaseCategoryFile:
+            diseaseCategory = json.load(diseaseCategoryFile)
+        # store subtypes into diseaseSubtype
+        diseaseSubtype = {}
+        for category in diseaseCategory["children"]:
+            for category2 in category["children"]:
+                if "children" not in category2:
+                    continue
+                for category3 in category2["children"]:
+                    if "children" not in category3:
+                        continue
+                    for category4 in category3["children"]:
+                        line = category4["name"]
+                        if not line.startswith("H"):
+                            continue
+                        diseaseID = line.split(" ")[0]
+                        diseaseSubtype[diseaseID] = category2["name"]
+        diseaseCategory = None
+        # with diseaseCategory
+        diseaseName = {}
+        with open("list-disease", "r") as pathFile:
+            for line in pathFile:
+                words = line.split("\t")
+                pathID = words[0]
+                if pathID not in diseaseSubtype:
+                    diseaseSubtype[pathID] = "-"
+                name = words[1].rstrip()
+                diseaseName[pathID] = name
+            # foreach line in diseaseFile
+        # with diseaseFile
+        self.log(" OK: %d diseases\n" % (len(diseaseName),))
+
+        # store diseases
+        self.log("writing diseases to the database ...")
+        listDisease = diseaseName.keys()
+        listSubtype = self.addSubtypes([(val,) for val in set(diseaseSubtype.values())])
+        listGID = self.addTypedGroups(
+            typeID["disease"],
+            (
+                (listSubtype[diseaseSubtype[diseaseID]], diseaseName[diseaseID], None)
+                for diseaseID in listDisease
+            ),
+        )
+        diseaseGID = dict(zip(listDisease, listGID))
+        self.log(" OK\n")
+
+        # store disease names
+        self.log("writing 
disease names to the database ...") + self.addGroupNamespacedNames( + namespaceID["kegg_id"], + ((diseaseGID[diseaseID], diseaseID) for diseaseID in listDisease), + ) + self.addGroupNamespacedNames( + namespaceID["disease"], + ( + (diseaseGID[diseaseID], diseaseName[diseaseID]) + for diseaseID in listDisease + ), + ) + self.log(" OK\n") + + # process disease & gene associations + self.log("processing disease gene associations ...") + entrezAssoc = set() + numAssoc = 0 + with open("link-disease-hsa", "r") as assocFile: + for line in assocFile: + words = line.split("\t") + hsaGene = words[0] + diseaseID = words[1].strip()[3:] + if diseaseID in diseaseGID: + numAssoc += 1 + entrezAssoc.add((diseaseGID[diseaseID], numAssoc, hsaGene[4:])) + # foreach line in assocFile + # with assocFile + self.log(" OK: %d associations\n" % (numAssoc,)) + + # store gene associations + self.log("writing gene associations to the database ...") + self.addGroupMemberTypedNamespacedNames( + typeID["gene"], namespaceID["entrez_gid"], entrezAssoc + ) + self.log(" OK\n") + entrezAssoc = None + + # update() + + +# Source_kegg diff --git a/unsupported_loaders/loki_source_netpath.py b/unsupported_loaders/loki_source_netpath.py index e50d56f..23e491d 100644 --- a/unsupported_loaders/loki_source_netpath.py +++ b/unsupported_loaders/loki_source_netpath.py @@ -5,109 +5,140 @@ class Source_netpath(loki_source.Source): - - - @classmethod - def getVersionString(cls): - return '2.0 (2013-02-14)' - #getVersionString() - - - def download(self, options): - # download the latest source files - self.downloadFilesFromHTTP('www.netpath.org', { - # 'NetPath_GeneReg_TSV.zip': '/data/batch/NetPath_GeneReg_TSV.zip', #Last-Modified: Fri, 31 Oct 2008 17:00:16 GMT - 'NetPath_GeneReg_TSV1.zip': '/data/batch/NetPath_GeneReg_TSV1.zip', #Last-Modified: Sat, 03 Sep 2011 10:07:03 GMT - }) - - return [ 'NetPath_GeneReg_TSV1.zip' ] - #download() - - - def update(self, options): - # clear out all old data from this source - self.log("deleting old records from the database ...") - self.deleteAll() - self.log(" OK\n") - - # get or create the required metadata records - namespaceID = self.addNamespaces([ - ('netpath_id', 0), - ('pathway', 0), - ('symbol', 0), - ('entrez_gid', 0), - ]) - typeID = self.addTypes([ - ('pathway',), - ('gene',), - ]) - subtypeID = self.addSubtypes([ - ('-',), - ]) - - # process pathways and associations - self.log("verifying archive ...") - pathName = {} - nsAssoc = { - 'symbol' : set(), - 'entrez_gid' : set(), - } - numAssoc = 0 - with zipfile.ZipFile('NetPath_GeneReg_TSV1.zip','r') as pathZip: - err = pathZip.testzip() - if err: - self.log(" ERROR\n") - self.log("CRC failed for %s\n" % err) - return False - self.log(" OK\n") - self.log("processing pathways ...") - for info in pathZip.infolist(): - # there should be only one, but just in case.. 
- if info.filename == 'NetPath_Gene_regulation_all.txt': - pathFile = pathZip.open(info,'r') - header = pathFile.__next__().rstrip() - if not header.decode().startswith("Gene regulation id Pathway name Pathway ID Gene name Entrez gene ID"): # Regulation Experiment PubMed ID - self.log(" ERROR\n") - self.log("unrecognized file header in '%s': %s\n" % (info.filename,header)) - return False - for line in pathFile: - words = line.decode('latin-1').split("\t") - pathway = words[1] - pathID = words[2] - gene = words[3].strip() - entrezID = words[4] - - pathName[pathID] = pathway - numAssoc += 1 - nsAssoc['entrez_gid'].add( (pathID,numAssoc,entrezID) ) - nsAssoc['symbol'].add( (pathID,numAssoc,gene) ) - #foreach line in pathFile - pathFile.close() - #if file is the one we want - #foreach file in pathZip - #with pathZip - numPathways = len(pathName) - numID = sum(len(nsAssoc[ns]) for ns in nsAssoc) - self.log(" OK: %d pathways, %d associations (%d identifiers)\n" % (numPathways,numAssoc,numID)) - - # store pathways - self.log("writing pathways to the database ...") - listPath = pathName.keys() - listGID = self.addTypedGroups(typeID['pathway'], ((subtypeID['-'], pathName[pathID],None) for pathID in listPath)) - pathGID = dict(zip(listPath,listGID)) - self.log(" OK\n") - - # store pathway names - self.log("writing pathway names to the database ...") - self.addGroupNamespacedNames(namespaceID['netpath_id'], ((pathGID[pathID],pathID) for pathID in listPath)) - self.addGroupNamespacedNames(namespaceID['pathway'], ((pathGID[pathID],pathName[pathID]) for pathID in listPath)) - self.log(" OK\n") - - # store gene associations - self.log("writing gene associations to the database ...") - for ns in nsAssoc: - self.addGroupMemberTypedNamespacedNames(typeID['gene'], namespaceID[ns], ((pathGID[assoc[0]],assoc[1],assoc[2]) for assoc in nsAssoc[ns])) - self.log(" OK\n") - #update() - -#Source_netpath + + @classmethod + def getVersionString(cls): + return "2.0 (2013-02-14)" + + # getVersionString() + + def download(self, options): + # download the latest source files + self.downloadFilesFromHTTP( + "www.netpath.org", + { + # 'NetPath_GeneReg_TSV.zip': '/data/batch/NetPath_GeneReg_TSV.zip', #Last-Modified: Fri, 31 Oct 2008 17:00:16 GMT + "NetPath_GeneReg_TSV1.zip": "/data/batch/NetPath_GeneReg_TSV1.zip", # Last-Modified: Sat, 03 Sep 2011 10:07:03 GMT + }, + ) + + return ["NetPath_GeneReg_TSV1.zip"] + + # download() + + def update(self, options): + # clear out all old data from this source + self.log("deleting old records from the database ...") + self.deleteAll() + self.log(" OK\n") + + # get or create the required metadata records + namespaceID = self.addNamespaces( + [ + ("netpath_id", 0), + ("pathway", 0), + ("symbol", 0), + ("entrez_gid", 0), + ] + ) + typeID = self.addTypes( + [ + ("pathway",), + ("gene",), + ] + ) + subtypeID = self.addSubtypes( + [ + ("-",), + ] + ) + + # process pathways and associations + self.log("verifying archive ...") + pathName = {} + nsAssoc = { + "symbol": set(), + "entrez_gid": set(), + } + numAssoc = 0 + with zipfile.ZipFile("NetPath_GeneReg_TSV1.zip", "r") as pathZip: + err = pathZip.testzip() + if err: + self.log(" ERROR\n") + self.log("CRC failed for %s\n" % err) + return False + self.log(" OK\n") + self.log("processing pathways ...") + for info in pathZip.infolist(): + # there should be only one, but just in case.. 
+                if info.filename == "NetPath_Gene_regulation_all.txt":
+                    pathFile = pathZip.open(info, "r")
+                    header = pathFile.__next__().rstrip()
+                    if not header.decode().startswith(
+                        "Gene regulation id Pathway name Pathway ID Gene name Entrez gene ID"
+                    ):  # Regulation Experiment PubMed ID
+                        self.log(" ERROR\n")
+                        self.log(
+                            "unrecognized file header in '%s': %s\n"
+                            % (info.filename, header)
+                        )
+                        return False
+                    for line in pathFile:
+                        words = line.decode("latin-1").split("\t")
+                        pathway = words[1]
+                        pathID = words[2]
+                        gene = words[3].strip()
+                        entrezID = words[4]
+
+                        pathName[pathID] = pathway
+                        numAssoc += 1
+                        nsAssoc["entrez_gid"].add((pathID, numAssoc, entrezID))
+                        nsAssoc["symbol"].add((pathID, numAssoc, gene))
+                    # foreach line in pathFile
+                    pathFile.close()
+                # if file is the one we want
+            # foreach file in pathZip
+        # with pathZip
+        numPathways = len(pathName)
+        numID = sum(len(nsAssoc[ns]) for ns in nsAssoc)
+        self.log(
+            " OK: %d pathways, %d associations (%d identifiers)\n"
+            % (numPathways, numAssoc, numID)
+        )
+
+        # store pathways
+        self.log("writing pathways to the database ...")
+        listPath = pathName.keys()
+        listGID = self.addTypedGroups(
+            typeID["pathway"],
+            ((subtypeID["-"], pathName[pathID], None) for pathID in listPath),
+        )
+        pathGID = dict(zip(listPath, listGID))
+        self.log(" OK\n")
+
+        # store pathway names
+        self.log("writing pathway names to the database ...")
+        self.addGroupNamespacedNames(
+            namespaceID["netpath_id"],
+            ((pathGID[pathID], pathID) for pathID in listPath),
+        )
+        self.addGroupNamespacedNames(
+            namespaceID["pathway"],
+            ((pathGID[pathID], pathName[pathID]) for pathID in listPath),
+        )
+        self.log(" OK\n")
+
+        # store gene associations
+        self.log("writing gene associations to the database ...")
+        for ns in nsAssoc:
+            self.addGroupMemberTypedNamespacedNames(
+                typeID["gene"],
+                namespaceID[ns],
+                ((pathGID[assoc[0]], assoc[1], assoc[2]) for assoc in nsAssoc[ns]),
+            )
+        self.log(" OK\n")
+
+    # update()
+
+
+# Source_netpath