From a27a9b4871803cdd91d250dba7fa8be484f15a44 Mon Sep 17 00:00:00 2001 From: Andreas Heger Date: Mon, 22 May 2017 11:49:43 +0100 Subject: [PATCH 01/26] {AH} fix wrong comparison --- CGAT/scripts/psl2psl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CGAT/scripts/psl2psl.py b/CGAT/scripts/psl2psl.py index cc7a4770..4b716aaa 100644 --- a/CGAT/scripts/psl2psl.py +++ b/CGAT/scripts/psl2psl.py @@ -194,7 +194,7 @@ def iterator_psl_intervals(options): except KeyError: tx = [] - if options.stdlog >= 2: + if options.loglevel >= 2: options.stdlog.write( "###################################################\n") options.stdlog.write("# testing %s\n" % (str(match))) From 7df645297c5e808bc9916f5bfa2048aea2e974f0 Mon Sep 17 00:00:00 2001 From: Andreas Heger Date: Mon, 22 May 2017 13:10:41 +0100 Subject: [PATCH 02/26] {AH} add encoding option to openFile --- CGAT/IOTools.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CGAT/IOTools.py b/CGAT/IOTools.py index 00b33efe..e3f4aa69 100644 --- a/CGAT/IOTools.py +++ b/CGAT/IOTools.py @@ -201,7 +201,7 @@ def touchFile(filename, times=None): fhandle.close() -def openFile(filename, mode="r", create_dir=False): +def openFile(filename, mode="r", create_dir=False, encoding="utf-8"): '''open file called *filename* with mode *mode*. gzip - compressed files are recognized by the @@ -235,9 +235,9 @@ def openFile(filename, mode="r", create_dir=False): if ext.lower() in (".gz", ".z"): if sys.version_info.major >= 3: if mode == "r": - return gzip.open(filename, 'rt', encoding="ascii") + return gzip.open(filename, 'rt', encoding=encoding) elif mode == "w": - return gzip.open(filename, 'wt', encoding="ascii") + return gzip.open(filename, 'wt', encoding=encoding) else: raise NotImplementedError( "mode '{}' not implemented".format(mode)) From aa8451694de2db419d731c39b27187d85c381bab Mon Sep 17 00:00:00 2001 From: Andreas Heger Date: Mon, 22 May 2017 13:11:06 +0100 Subject: [PATCH 03/26] {AH} raise error if BAM does not contain read names --- CGAT/scripts/_bam2stats.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CGAT/scripts/_bam2stats.pyx b/CGAT/scripts/_bam2stats.pyx index 4ece634f..d6d55240 100644 --- a/CGAT/scripts/_bam2stats.pyx +++ b/CGAT/scripts/_bam2stats.pyx @@ -145,6 +145,9 @@ def count(AlignmentFile samfile, if count_fastq: read_name = pysam_bam_get_qname(read._delegate) + if read_name == NULL: + raise ValueError("file does not contain read names, can't count using fastq") + # terminate string at first space to # truncate read names containing more than # just the id From b16f691f925e050ff784ffe84a0dead56a698a4c Mon Sep 17 00:00:00 2001 From: Andreas Heger Date: Mon, 22 May 2017 16:29:24 +0100 Subject: [PATCH 04/26] {AH} log filename for easier debugging --- CGAT/Expression.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CGAT/Expression.py b/CGAT/Expression.py index 3a298baf..33d2cbfb 100644 --- a/CGAT/Expression.py +++ b/CGAT/Expression.py @@ -2168,7 +2168,9 @@ def runEdgeR(outfile, ref_regex) # output heatmap plot - R.png('%(outfile_prefix)sheatmap.png' % locals()) + fn = '%(outfile_prefix)sheatmap.png' % locals() + E.info("outputing heatmap to {}".format(fn)) + R.png(fn) plotCorrelationHeatmap() r['dev.off']() From 5fbdcf5b8e33846d71d08e10f8c634c2d123eb72 Mon Sep 17 00:00:00 2001 From: Andreas Heger Date: Mon, 22 May 2017 16:30:07 +0100 Subject: [PATCH 05/26] {AH} fix byte/string issue with touching gzipped files --- CGAT/IOTools.py | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/CGAT/IOTools.py b/CGAT/IOTools.py index e3f4aa69..47b49a2a 100644 --- a/CGAT/IOTools.py +++ b/CGAT/IOTools.py @@ -189,11 +189,13 @@ def touchFile(filename, times=None): as empty 'gzip' files, i.e., with a header. ''' existed = os.path.exists(filename) - fhandle = open(filename, 'a') if filename.endswith(".gz") and not existed: # this will automatically add a gzip header + fhandle = open(filename, 'a+b') fhandle = gzip.GzipFile(filename, fileobj=fhandle) + else: + fhandle = open(filename, 'a') try: os.utime(filename, times) @@ -238,13 +240,12 @@ def openFile(filename, mode="r", create_dir=False, encoding="utf-8"): return gzip.open(filename, 'rt', encoding=encoding) elif mode == "w": return gzip.open(filename, 'wt', encoding=encoding) - else: - raise NotImplementedError( - "mode '{}' not implemented".format(mode)) + elif mode == "a": + return gzip.open(filename, 'wt', encoding=encoding) else: return gzip.open(filename, mode) else: - return open(filename, mode) + return open(filename, mode, encoding=encoding) def force_str(iterator, encoding="ascii"): @@ -812,14 +813,6 @@ def __init__(self, self.mFiles = {} self.mOutputPattern = output_pattern - - self.open = open - - if output_pattern: - _, ext = os.path.splitext(output_pattern) - if ext.lower() in (".gz", ".z"): - self.open = gzip.open - self.mCounts = collections.defaultdict(int) self.mHeader = header if force and output_pattern: @@ -881,7 +874,7 @@ def openFile(self, filename, mode="w"): if dirname and not os.path.exists(dirname): os.makedirs(dirname) - return self.open(filename, mode) + return openFile(filename, mode) def write(self, identifier, line): """write `line` to file specified by `identifier`""" @@ -894,7 +887,7 @@ def write(self, identifier, line): f.close() self.mFiles = {} - self.mFiles[filename] = self.openFile(filename, "a") + self.mFiles[filename] = openFile(filename, "a") if self.mHeader: self.mFiles[filename].write(self.mHeader) @@ -947,7 +940,7 @@ def close(self): raise IOError("write on closed FilePool in close()") for filename, data in self.data.items(): - f = self.openFile(filename, "a") + f = openFile(filename, "a") if self.mHeader: f.write(self.mHeader) f.write("".join(data)) From 9e8e46092439a071d6fee547ef4a10ce6c03ebbc Mon Sep 17 00:00:00 2001 From: Andreas Heger Date: Mon, 22 May 2017 16:30:26 +0100 Subject: [PATCH 06/26] {AH} open temp file in text mode --- CGAT/Masker.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/CGAT/Masker.py b/CGAT/Masker.py index ea97b8cd..12b54853 100644 --- a/CGAT/Masker.py +++ b/CGAT/Masker.py @@ -141,12 +141,9 @@ def maskSequence(self, peptide_sequence): def maskSequences(self, sequences): '''mask a collection of sequences.''' - outfile, infile = tempfile.mkstemp() - - for x, s in enumerate(sequences): - os.write(outfile, ">%i\n%s\n" % (x, s)) - - os.close(outfile) + with tempfile.NamedTemporaryFile(mode="w+t", delete=False) as infile: + for x, s in enumerate(sequences): + infile.write(">%i\n%s\n" % (x, s)) statement = self.mCommand % locals() @@ -168,7 +165,7 @@ def maskSequences(self, sequences): result = [ x.sequence for x in FastaIterator.iterate(StringIO(out))] - os.remove(infile) + os.remove(infile.name) return result From 61edc07a963db559e5c428ba8906b192cb3ff2f5 Mon Sep 17 00:00:00 2001 From: Andreas Heger Date: Mon, 22 May 2017 16:30:41 +0100 Subject: [PATCH 07/26] {AH} use globals in exec --- CGAT/scripts/csv_select.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/CGAT/scripts/csv_select.py b/CGAT/scripts/csv_select.py index 1aceb247..2f81b71b 100644 --- a/CGAT/scripts/csv_select.py +++ b/CGAT/scripts/csv_select.py @@ -80,8 +80,7 @@ def main(argv=None): reader = csv.DictReader(CSV.CommentStripper(sys.stdin), dialect=options.csv_dialect) - exec("f = lambda r: %s" % statement, locals()) - + exec("f = lambda r: %s" % statement, globals()) counter = E.Counter() writer = csv.DictWriter(options.stdout, reader.fieldnames, From dd5348516efd4774cafab0da12ca73dee84871da Mon Sep 17 00:00:00 2001 From: Andreas Heger Date: Mon, 22 May 2017 16:31:02 +0100 Subject: [PATCH 08/26] {AH} open temp file in text mode --- CGAT/scripts/runExpression.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CGAT/scripts/runExpression.py b/CGAT/scripts/runExpression.py index 8d763768..9f30713c 100644 --- a/CGAT/scripts/runExpression.py +++ b/CGAT/scripts/runExpression.py @@ -284,7 +284,7 @@ def main(argv=None): (options, args) = E.Start(parser, argv=argv, add_output_options=True) if options.input_filename_tags == "-": - fh = tempfile.NamedTemporaryFile(delete=False) + fh = tempfile.NamedTemporaryFile(delete=False, mode="w+t") fh.write("".join([x for x in options.stdin])) fh.close() options.input_filename_tags = fh.name From 8e6580af9f4ba9a97351912b45fa2b3e8bcf7b32 Mon Sep 17 00:00:00 2001 From: Andreas Heger Date: Mon, 22 May 2017 16:56:15 +0100 Subject: [PATCH 09/26] {AH} decode subprocess output for parsing --- CGAT/Masker.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/CGAT/Masker.py b/CGAT/Masker.py index 12b54853..2582fd28 100644 --- a/CGAT/Masker.py +++ b/CGAT/Masker.py @@ -141,10 +141,11 @@ def maskSequence(self, peptide_sequence): def maskSequences(self, sequences): '''mask a collection of sequences.''' - with tempfile.NamedTemporaryFile(mode="w+t", delete=False) as infile: + with tempfile.NamedTemporaryFile(mode="w+t", delete=False) as outf: for x, s in enumerate(sequences): - infile.write(">%i\n%s\n" % (x, s)) + outf.write(">%i\n%s\n" % (x, s)) + infile = outf.name statement = self.mCommand % locals() E.debug("statement: %s" % statement) @@ -161,11 +162,11 @@ def maskSequences(self, sequences): raise RuntimeError( "Error in running %s \n%s\nTemporary directory" % (statement, err)) - + result = [ - x.sequence for x in FastaIterator.iterate(StringIO(out))] + x.sequence for x in FastaIterator.iterate(StringIO(out.decode()))] - os.remove(infile.name) + os.remove(infile) return result From 8b3ffa52e825d144a5dda7ef7353f2562e3f3e64 Mon Sep 17 00:00:00 2001 From: MikeDMorgan Date: Sun, 2 Apr 2017 08:53:38 +0100 Subject: [PATCH 10/26] add cassi results format parsing --- CGAT/PipelineGWAS.py | 43 +++++++++++++++++++++++++------- CGAT/scripts/assoc2plot.py | 14 ++++++++--- CGAT/scripts/extract_stats.py | 46 ++++++++++++++++++++++++++++++++--- R/epistasis_covar.R | 18 +++++++++----- 4 files changed, 99 insertions(+), 22 deletions(-) diff --git a/CGAT/PipelineGWAS.py b/CGAT/PipelineGWAS.py index 8ec1945a..47a1811c 100755 --- a/CGAT/PipelineGWAS.py +++ b/CGAT/PipelineGWAS.py @@ -2184,9 +2184,10 @@ def parse_genome_wide(self, association_files): return df def get_results(self, association_file, - epistasis=False): + epistasis=False, + file_format="plink"): ''' - Parse a GWA results file and return the table + Parse a GWA or epistasis results file and return the table ''' # use Pandas for now - try something different later @@ -2216,16 +2217,40 @@ def get_results(self, association_file, index_col=None) except StopIteration: results_frame = pd.read_table(association_file, - sep="\t", header=None, + sep="\t", header=0, index_col=None) # results from fast epistasis are different to others - if results_frame.shape[1] == 7: - results_frame.columns = ["CHR1", "SNP1", "CHR", - "SNP", "OR", "STAT", "P"] - else: - results_frame.columns = ["CHR", "SNP", "BP", "A1", "OR", - "SE", "STAT", "P"] + if file_format == "cassi_covar": + if results_frme.shape[1] == 12: + results_frame.columns = ["SNP1", "CHR1", "ID1", "BP1", + "SNP2", "CHR2", "ID2", "BP2", + "OR", "SE", "STAT", "P"] + elif results_frame.shape[1] == 14: + results_frame.columns = ["SNP1", "CHR1", "ID1", "BP1", + "SNP2", "CHR2", "ID2", "BP2", + "OR", "SE", "STAT", "P", + "CASE_RSQ", "CTRL_RSQ"] + elif results_frame.shape[1] == 16: + results_frame.columns = ["SNP1", "CHR1", "ID1", "BP", + "SNP2", "CHR2", "ID2", "BP2", + "OR", "SE", "STAT", "P", + "CASE_RSQ", "CTRL_RSQ", + "CASE_DPRIME" "CTRL_DPRIME"] + results_frame.loc[:, "BP"] = pd.to_numeric(results_frame["BP"], + errors="coerce") + elif file_format == "cassi": + pass + elif file_format == "plink": + if results_frame.shape[1] == 7: + results_frame.columns = ["CHR1", "SNP1", "CHR", + "SNP", "OR", "STAT", "P"] + elif results_frame.shape[1] == 9: + results_frame.columns = ["CHR", "SNP", "BP", "A1", "NMISS", + "OR", "SE", "STAT", "P"] + else: + results_frame.columns = ["CHR", "SNP", "BP", "A1", "OR", + "SE", "STAT", "P"] results_frame.loc[:, "BP"] = pd.to_numeric(results_frame["BP"], errors="coerce") diff --git a/CGAT/scripts/assoc2plot.py b/CGAT/scripts/assoc2plot.py index c95813cc..ad6c3c65 100755 --- a/CGAT/scripts/assoc2plot.py +++ b/CGAT/scripts/assoc2plot.py @@ -61,6 +61,11 @@ def main(argv=None): "depicts the whole genome, a single chromosome or " "a specific locus") + parser.add_option("--file-format", dest="file_format", type="choice", + choices=["plink", "cassi", "cassi_covar"], + help="input file format, used to parse the file " + "properly") + parser.add_option("--save-path", dest="save_path", type="string", help="path and filename to save image to") @@ -68,7 +73,8 @@ def main(argv=None): (options, args) = E.Start(parser, argv=argv) parser.set_defaults(resolution="genome_wide", - plot_type="manhattan") + plot_type="manhattan", + file_format="plink") # if the input is a list of files, split them infile = argv[-1] @@ -82,10 +88,12 @@ def main(argv=None): if len(infiles) > 1: results = gwas.GWASResults(assoc_file=infiles, - epistasis=epi) + epistasis=epi, + file_format=options.file_format) elif len(infiles) == 1: results = gwas.GWASResults(assoc_file=infile, - epistasis=epi) + epistasis=epi, + file_format=options.file_format) else: raise IOError("no input files detected, please specifiy association " "results files as the last command line argument") diff --git a/CGAT/scripts/extract_stats.py b/CGAT/scripts/extract_stats.py index 41139298..623865a6 100644 --- a/CGAT/scripts/extract_stats.py +++ b/CGAT/scripts/extract_stats.py @@ -52,15 +52,35 @@ import numpy as np import re import sqlite3 as sql +from sqlalchemy import * +import MySQLdb -def getTableFromDb(db, table): +def getTableFromDb(db, table, backend, + username, db_hostname, + db_port): ''' Get a table from a database with pandas ''' - state = ''' SELECT * FROM %(table)s;''' % locals() - dbh = sql.connect(db) + state = text(''' SELECT * FROM %(table)s;''' % locals()) + if backend == "sqlite": + create_string = "sqlite:///{}".format(db) + elif backend == "mysql": + dbhandle = MySQLdb.connect(host=db_hostname, + user=username, + passwd="", + port=db_port, + db=db) + + #create_string = '''mysql://cgat@gandalf.anat.ox.ac.uk/%(db)s''' % locals() + else: + raise TypeError("database backend not recognised") + + #engine = create_engine(create_string) + #dbh = engine.connect() + #dbh = sql.connect(db) + df = pdsql.read_sql(state, dbh) df.index = df["track"] df.drop(labels="track", inplace=True, @@ -219,6 +239,19 @@ def main(argv=None): parser.add_option("-d", "--database", dest="database", type="string", help="SQLite database containing relevant tables") + parser.add_option("--database-backend", dest="database_backend", + type="choice", choices=["sqlite", "mysql"], + help="database backend to use") + + parser.add_option("--database-username", dest="database_username", + type="string", help="MySQL username") + + parser.add_option("--database-hostname", dest="database_hostname", + type="string", help="MySQL server hostname") + + parser.add_option("--database-port", dest="databse_port", + type="string", help="MySQL server port") + parser.add_option("-t", "--table-name", dest="table", type="string", help="table in SQLite DB to extract") @@ -226,8 +259,13 @@ def main(argv=None): (options, args) = E.Start(parser, argv=argv) if options.task == "extract_table": + print options.database_hostname out_df = getTableFromDb(db=options.database, - table=options.table) + table=options.table, + backend=options.database_backend, + username=options.database_username, + db_hostname=options.database_hostname, + db_port=int(options.databse_port)) elif options.task == "get_coverage": out_df = getModelCoverage(db=options.database, diff --git a/R/epistasis_covar.R b/R/epistasis_covar.R index b0da405c..87eda449 100644 --- a/R/epistasis_covar.R +++ b/R/epistasis_covar.R @@ -8,15 +8,21 @@ Rplink <- function(PHENO, GENO, CLUSTER, COVAR){ epi.log <- function(snp){ # covar positions are based on the input data table, # not the position they are passed in! - # Assume SNP is the first column - test.snp = COVAR[, 1] - covars = COVAR[,(2:dim(COVAR)[2])] - m <- glm(PHENO == 2 ~ snp + covars + test.snp*snp, family=binomial(link="logit")) + # Assume SNP is the last column + n.covar <- dim(COVAR)[2] + test.snp <- COVAR[, n.covar] + if(dim(COVAR)[2] > 2){ + covars <- COVAR[, 1:(n.covar - 1)] + } + else { + covars <- as.matrix(COVAR[, 2], ncol=1) + } + m <- glm(PHENO == 2 ~ covars + test.snp + snp + test.snp*snp, family=binomial(link="logit")) len <- dim(summary(m)$coefficients)[1] sum.snp <- summary(m)$coefficients[len,] ors <- exp(sum.snp[1]) - r <- c(ors, sum.snp[2], sum.snp[3], sum.snp[4]) - names(r) <- c("OR", "SE", "STAT", "P") + r <- c(dim(covars)[1], ors, sum.snp[2], sum.snp[3], sum.snp[4]) + names(r) <- c("NMISS", "OR", "SE", "STAT", "P") return(c(length(r), r)) } From e06c9f252c8c9c5e852098937fa6bf321a0ce9ef Mon Sep 17 00:00:00 2001 From: MikeDMorgan Date: Mon, 22 May 2017 20:43:10 +0100 Subject: [PATCH 11/26] add correct extract_stats.py for pipeline tests --- CGAT/scripts/extract_stats.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CGAT/scripts/extract_stats.py b/CGAT/scripts/extract_stats.py index 623865a6..23ff7bfe 100644 --- a/CGAT/scripts/extract_stats.py +++ b/CGAT/scripts/extract_stats.py @@ -63,9 +63,11 @@ def getTableFromDb(db, table, backend, Get a table from a database with pandas ''' - state = text(''' SELECT * FROM %(table)s;''' % locals()) + state = '''SELECT * FROM %(table)s''' % locals() + # state = text('''SELECT * FROM %(table)s;''' % locals()) if backend == "sqlite": create_string = "sqlite:///{}".format(db) + dbh = sql.connect(db) elif backend == "mysql": dbhandle = MySQLdb.connect(host=db_hostname, user=username, @@ -79,9 +81,8 @@ def getTableFromDb(db, table, backend, #engine = create_engine(create_string) #dbh = engine.connect() - #dbh = sql.connect(db) - df = pdsql.read_sql(state, dbh) + df = pdsql.read_sql(sql=state, con=dbh) df.index = df["track"] df.drop(labels="track", inplace=True, axis=1) @@ -259,7 +260,6 @@ def main(argv=None): (options, args) = E.Start(parser, argv=argv) if options.task == "extract_table": - print options.database_hostname out_df = getTableFromDb(db=options.database, table=options.table, backend=options.database_backend, From 739078b1228aee7b7746959ce6103e56e7a09782 Mon Sep 17 00:00:00 2001 From: MikeDMorgan Date: Mon, 22 May 2017 22:50:37 +0100 Subject: [PATCH 12/26] fix pep8 error --- CGAT/PipelineGWAS.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CGAT/PipelineGWAS.py b/CGAT/PipelineGWAS.py index 47a1811c..002d8db3 100755 --- a/CGAT/PipelineGWAS.py +++ b/CGAT/PipelineGWAS.py @@ -2238,7 +2238,7 @@ def get_results(self, association_file, "CASE_RSQ", "CTRL_RSQ", "CASE_DPRIME" "CTRL_DPRIME"] results_frame.loc[:, "BP"] = pd.to_numeric(results_frame["BP"], - errors="coerce") + errors="coerce") elif file_format == "cassi": pass elif file_format == "plink": From a2d0f2fe5ffefcc378a617ed601b1a603e5b2c75 Mon Sep 17 00:00:00 2001 From: Andreas Heger Date: Tue, 23 May 2017 13:46:11 +0100 Subject: [PATCH 13/26] {AH} add GTF_test --- tests/GTF_test.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 tests/GTF_test.py diff --git a/tests/GTF_test.py b/tests/GTF_test.py new file mode 100644 index 00000000..52179515 --- /dev/null +++ b/tests/GTF_test.py @@ -0,0 +1,21 @@ +import unittest +import os + +import CGAT.GTF as GTF +import CGAT.IOTools as IOTools + +class TestIteration(unittest.TestCase): + + filename = os.path.join("data", "hg19.small.gtf.gz") + + def test_number_of_intervals_is_correct(self): + + with IOTools.openFile(self.filename) as inf: + records = list(GTF.iterator(inf)) + + self.assertEqual(len(records), + 100) + + +if __name__ == "__main__": + unittest.main() From 35be2f385d893560dc5187f242b046ac9efd2048 Mon Sep 17 00:00:00 2001 From: Andreas Heger Date: Tue, 23 May 2017 13:50:16 +0100 Subject: [PATCH 14/26] {AH} --- CGAT/FastaIterator.py | 1 - 1 file changed, 1 deletion(-) diff --git a/CGAT/FastaIterator.py b/CGAT/FastaIterator.py index f8508929..d941eade 100644 --- a/CGAT/FastaIterator.py +++ b/CGAT/FastaIterator.py @@ -92,7 +92,6 @@ def iterate(infile, comment="#", fold=False): ------ FastaRecord ''' - h = infile.readline()[:-1] if not h: From f4b878c2acd0c9936dc4c0e971c5506ab1359a2b Mon Sep 17 00:00:00 2001 From: Andreas Heger Date: Tue, 23 May 2017 13:56:29 +0100 Subject: [PATCH 15/26] {AH} open file in text mode --- CGAT/scripts/cgat_fasta2cDNA.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/CGAT/scripts/cgat_fasta2cDNA.py b/CGAT/scripts/cgat_fasta2cDNA.py index 3decf5c4..7aa7cd5e 100644 --- a/CGAT/scripts/cgat_fasta2cDNA.py +++ b/CGAT/scripts/cgat_fasta2cDNA.py @@ -1,18 +1,12 @@ ''' -cgat_fasta2cDNA.py - template for CGAT scripts -==================================================== +cgat_fasta2cDNA.py - converting multi-fasta of exon features into a multi-fasta of spliced cDNAs/RNAs +====================================================================================================== -:Author: -:Release: $Id$ -:Date: |today| :Tags: Python Purpose ------- -.. Mike transcript processing - converting multi-fasta of exon -features into a multi-fasta of spliced cDNAs/RNAs - Usage ----- @@ -45,7 +39,7 @@ def makeSplicedFasta(infile): ''' fasta_dict = {} - with IOTools.openFile(infile, "rb") as fafile: + with IOTools.openFile(infile) as fafile: for line in fafile.readlines(): if line[0] == '>': header = line.rstrip("\n") From d706a14c19f30eaad0513e6ec73ab0edcf7c3a9f Mon Sep 17 00:00:00 2001 From: Andreas Heger Date: Tue, 23 May 2017 16:43:16 +0100 Subject: [PATCH 16/26] {AH} py3 fixes --- CGAT/scripts/extract_stats.py | 6 ++++-- tests/GTF_test.py | 5 +++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/CGAT/scripts/extract_stats.py b/CGAT/scripts/extract_stats.py index 23ff7bfe..d1799cec 100644 --- a/CGAT/scripts/extract_stats.py +++ b/CGAT/scripts/extract_stats.py @@ -95,9 +95,11 @@ def cleanStatsTable(stats_file): Take in a table containing aggregated stats and clean by removing duplicate columns ''' - + # , mangle_dupe_cols=False) + # AH: disabled, because "ValueError: Setting mangle_dupe_cols=False is not supported yet" _df = pd.read_table(stats_file, sep="\t", header=0, - index_col=None, mangle_dupe_cols=False) + index_col=None) + # drop duplicates is case sensitive, convert all to # same case - SQL is not case sensitive so will throw # a hissy fit for same column names in different cases diff --git a/tests/GTF_test.py b/tests/GTF_test.py index 52179515..7990c7d2 100644 --- a/tests/GTF_test.py +++ b/tests/GTF_test.py @@ -4,15 +4,16 @@ import CGAT.GTF as GTF import CGAT.IOTools as IOTools + class TestIteration(unittest.TestCase): filename = os.path.join("data", "hg19.small.gtf.gz") def test_number_of_intervals_is_correct(self): - + with IOTools.openFile(self.filename) as inf: records = list(GTF.iterator(inf)) - + self.assertEqual(len(records), 100) From f03c2112cd8022305d1e95a48e7ec94cbdc88291 Mon Sep 17 00:00:00 2001 From: Andreas Heger Date: Mon, 5 Jun 2017 10:38:52 +0100 Subject: [PATCH 17/26] {AH} fix output sort order --- CGAT/scripts/cgat_fasta2cDNA.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CGAT/scripts/cgat_fasta2cDNA.py b/CGAT/scripts/cgat_fasta2cDNA.py index 7aa7cd5e..1b2593cc 100644 --- a/CGAT/scripts/cgat_fasta2cDNA.py +++ b/CGAT/scripts/cgat_fasta2cDNA.py @@ -47,7 +47,7 @@ def makeSplicedFasta(infile): else: fasta_dict[header] += line.rstrip("\n") - for key, value in fasta_dict.items(): + for key, value in sorted(fasta_dict.items()): yield "%s\n%s\n" % (key, value) From 22f85982b7f1962118ff2aecbbdc9fd114bfeeca Mon Sep 17 00:00:00 2001 From: Sebastian Luna Valero Date: Thu, 8 Jun 2017 09:24:34 +0100 Subject: [PATCH 18/26] remove Py2 testing in Py3 branch --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index f991b6e9..eb0b8f64 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,7 +20,7 @@ env: - TEST_ALL=1 python: - - "2.7" +# - "2.7" - "3.5" # Using xvfb to Run Tests That Require a GUI From 34ef3666fc2c7b54b41da5fa6fd8a46c8da89aca Mon Sep 17 00:00:00 2001 From: Sebastian Luna Valero Date: Thu, 8 Jun 2017 09:43:37 +0100 Subject: [PATCH 19/26] force icu update --- install-CGAT-tools.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install-CGAT-tools.sh b/install-CGAT-tools.sh index 53429557..58ceb963 100755 --- a/install-CGAT-tools.sh +++ b/install-CGAT-tools.sh @@ -292,7 +292,7 @@ conda create -q -n $CONDA_INSTALL_ENV $CONDA_INSTALL_TYPE python=$INSTALL_PYTHON # SLV brute force until pysam problem is solved conda uninstall -n $CONDA_INSTALL_ENV pysam --override-channels --channel conda-forge --channel defaults --channel r --channel bioconda --yes -conda install -n $CONDA_INSTALL_ENV pysam=0.11.1 pybedtools --override-channels --channel conda-forge --channel defaults --channel r --channel bioconda --yes +conda install -n $CONDA_INSTALL_ENV pysam=0.11.1 pybedtools icu=58 --override-channels --channel conda-forge --channel defaults --channel r --channel bioconda --yes log "installing CGAT code into conda environment" # if installation is 'devel' (outside of travis), checkout latest version from github From d0a9d3696a0b503f095d1b6c0fd9f8186405f11c Mon Sep 17 00:00:00 2001 From: Sebastian Luna Valero Date: Thu, 8 Jun 2017 10:15:19 +0100 Subject: [PATCH 20/26] try different channel order for conda --- install-CGAT-tools.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/install-CGAT-tools.sh b/install-CGAT-tools.sh index 58ceb963..763fd5a2 100755 --- a/install-CGAT-tools.sh +++ b/install-CGAT-tools.sh @@ -291,8 +291,8 @@ log "installing conda CGAT environment" conda create -q -n $CONDA_INSTALL_ENV $CONDA_INSTALL_TYPE python=$INSTALL_PYTHON_VERSION gcc --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes # SLV brute force until pysam problem is solved -conda uninstall -n $CONDA_INSTALL_ENV pysam --override-channels --channel conda-forge --channel defaults --channel r --channel bioconda --yes -conda install -n $CONDA_INSTALL_ENV pysam=0.11.1 pybedtools icu=58 --override-channels --channel conda-forge --channel defaults --channel r --channel bioconda --yes +conda uninstall -n $CONDA_INSTALL_ENV pysam --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes +conda install -n $CONDA_INSTALL_ENV pysam=0.11.1 pybedtools --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes log "installing CGAT code into conda environment" # if installation is 'devel' (outside of travis), checkout latest version from github From edae8e020bb0760dcaf361193e7e1245ad90a67c Mon Sep 17 00:00:00 2001 From: Sebastian Luna Valero Date: Thu, 8 Jun 2017 10:30:31 +0100 Subject: [PATCH 21/26] try different conda command --- install-CGAT-tools.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/install-CGAT-tools.sh b/install-CGAT-tools.sh index 763fd5a2..d72eb0fa 100755 --- a/install-CGAT-tools.sh +++ b/install-CGAT-tools.sh @@ -288,11 +288,12 @@ conda update -q conda --yes conda info -a log "installing conda CGAT environment" -conda create -q -n $CONDA_INSTALL_ENV $CONDA_INSTALL_TYPE python=$INSTALL_PYTHON_VERSION gcc --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes +#conda create -q -n $CONDA_INSTALL_ENV $CONDA_INSTALL_TYPE python=$INSTALL_PYTHON_VERSION gcc --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes +conda create -q -n $CONDA_INSTALL_ENV $CONDA_INSTALL_TYPE python=$INSTALL_PYTHON_VERSION pysam=0.11.1 r=3.3.1 gcc --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes # SLV brute force until pysam problem is solved -conda uninstall -n $CONDA_INSTALL_ENV pysam --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes -conda install -n $CONDA_INSTALL_ENV pysam=0.11.1 pybedtools --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes +#conda uninstall -n $CONDA_INSTALL_ENV pysam --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes +#conda install -n $CONDA_INSTALL_ENV pysam=0.11.1 pybedtools --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes log "installing CGAT code into conda environment" # if installation is 'devel' (outside of travis), checkout latest version from github From 27f65e0e8a79a98c71de2d0bf128337ae2a9f3cd Mon Sep 17 00:00:00 2001 From: Sebastian Luna Valero Date: Thu, 8 Jun 2017 10:54:24 +0100 Subject: [PATCH 22/26] try Py2 testing again --- .travis.yml | 2 +- install-CGAT-tools.sh | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index eb0b8f64..f991b6e9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,7 +20,7 @@ env: - TEST_ALL=1 python: -# - "2.7" + - "2.7" - "3.5" # Using xvfb to Run Tests That Require a GUI diff --git a/install-CGAT-tools.sh b/install-CGAT-tools.sh index d72eb0fa..95cba8f2 100755 --- a/install-CGAT-tools.sh +++ b/install-CGAT-tools.sh @@ -288,13 +288,11 @@ conda update -q conda --yes conda info -a log "installing conda CGAT environment" -#conda create -q -n $CONDA_INSTALL_ENV $CONDA_INSTALL_TYPE python=$INSTALL_PYTHON_VERSION gcc --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes +# SLV workaround until problems are resolved for: +# * Latest R 3.3.2 available in conda: Problems with icu here https://travis-ci.org/CGATOxford/cgat/builds/240711411 +# * Latest pysam works with the CGAT code: the gff3 support is broken, best not to use 0.11.2 for now (AH comment) conda create -q -n $CONDA_INSTALL_ENV $CONDA_INSTALL_TYPE python=$INSTALL_PYTHON_VERSION pysam=0.11.1 r=3.3.1 gcc --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes -# SLV brute force until pysam problem is solved -#conda uninstall -n $CONDA_INSTALL_ENV pysam --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes -#conda install -n $CONDA_INSTALL_ENV pysam=0.11.1 pybedtools --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes - log "installing CGAT code into conda environment" # if installation is 'devel' (outside of travis), checkout latest version from github if [[ "$OS" != "travis" ]] ; then From 6399552f163df205be1dcec383149708b9cb4546 Mon Sep 17 00:00:00 2001 From: Sebastian Luna Valero Date: Thu, 8 Jun 2017 11:07:43 +0100 Subject: [PATCH 23/26] Py2's open() function does not support the 'encoding' argument so removing Py2 tests from now on --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index f991b6e9..eb0b8f64 100644 --- a/.travis.yml +++ b/.travis.yml @@ -20,7 +20,7 @@ env: - TEST_ALL=1 python: - - "2.7" +# - "2.7" - "3.5" # Using xvfb to Run Tests That Require a GUI From 007e34d23185e9e996c47ec01c1cade05ea495ea Mon Sep 17 00:00:00 2001 From: Sebastian Luna Valero Date: Thu, 8 Jun 2017 11:26:53 +0100 Subject: [PATCH 24/26] add important reference about issue with R 3.3.2 and icu library --- install-CGAT-tools.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/install-CGAT-tools.sh b/install-CGAT-tools.sh index 95cba8f2..7e0fbfc9 100755 --- a/install-CGAT-tools.sh +++ b/install-CGAT-tools.sh @@ -289,7 +289,7 @@ conda info -a log "installing conda CGAT environment" # SLV workaround until problems are resolved for: -# * Latest R 3.3.2 available in conda: Problems with icu here https://travis-ci.org/CGATOxford/cgat/builds/240711411 +# * Latest R 3.3.2 available in conda: Problems with icu here https://travis-ci.org/CGATOxford/cgat/builds/240711411 and here https://github.com/ContinuumIO/anaconda-issues/issues/1403 # * Latest pysam works with the CGAT code: the gff3 support is broken, best not to use 0.11.2 for now (AH comment) conda create -q -n $CONDA_INSTALL_ENV $CONDA_INSTALL_TYPE python=$INSTALL_PYTHON_VERSION pysam=0.11.1 r=3.3.1 gcc --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes From 3ac8b568329e13fba5574a3786776e69338a3879 Mon Sep 17 00:00:00 2001 From: Charlie-George Date: Tue, 27 Jun 2017 19:08:43 +0100 Subject: [PATCH 25/26] changed float to integer division for shift calculation for py3 compatability --- CGAT/scripts/bed2table.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CGAT/scripts/bed2table.py b/CGAT/scripts/bed2table.py index 5841f456..3a1cb26d 100644 --- a/CGAT/scripts/bed2table.py +++ b/CGAT/scripts/bed2table.py @@ -239,7 +239,7 @@ def _count(self, bed, bamfiles, offsets): # if offsets are given, shift tags. for samfile, offset in zip(bamfiles, offsets): - shift = offset / 2 + shift = offset // 2 # for peak counting I follow the MACS protocoll, # see the function def __tags_call_peak in PeakDetect.py # In words From 56ea408990877f7466795bdd55b194e4bd108eaa Mon Sep 17 00:00:00 2001 From: Andreas Heger Date: Mon, 26 Jun 2017 19:55:24 +0100 Subject: [PATCH 26/26] {AH} py3-fix for getLastLine --- CGAT/IOTools.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/CGAT/IOTools.py b/CGAT/IOTools.py index 47b49a2a..099a9e99 100644 --- a/CGAT/IOTools.py +++ b/CGAT/IOTools.py @@ -68,7 +68,7 @@ def getFirstLine(filename, nlines=1): return line -def getLastLine(filename, nlines=1, read_size=1024): +def getLastLine(filename, nlines=1, read_size=1024, encoding="utf-8"): """return the last line of a file. This method works by working back in blocks of `read_size` until @@ -90,8 +90,8 @@ def getLastLine(filename, nlines=1, read_size=1024): """ - # U is to open it with Universal newline support - f = open(filename, 'rU') + # py3 requires binary mode for negative seeks + f = open(filename, 'rb') offset = read_size f.seek(0, 2) file_size = f.tell() @@ -102,10 +102,8 @@ def getLastLine(filename, nlines=1, read_size=1024): offset = file_size f.seek(-1 * offset, 2) read_str = f.read(offset) - # Remove newline at the end - if read_str[offset - 1] == '\n': - read_str = read_str[:-1] - lines = read_str.split('\n') + read_str = read_str.decode(encoding) + lines = read_str.strip().splitlines() if len(lines) >= nlines + 1: return "\n".join(lines[-nlines:]) if offset == file_size: # reached the beginning