From a27a9b4871803cdd91d250dba7fa8be484f15a44 Mon Sep 17 00:00:00 2001
From: Andreas Heger <andreas.heger@gmail.com>
Date: Mon, 22 May 2017 11:49:43 +0100
Subject: [PATCH 01/26] {AH} fix wrong comparison

---
 CGAT/scripts/psl2psl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CGAT/scripts/psl2psl.py b/CGAT/scripts/psl2psl.py
index cc7a4770..4b716aaa 100644
--- a/CGAT/scripts/psl2psl.py
+++ b/CGAT/scripts/psl2psl.py
@@ -194,7 +194,7 @@ def iterator_psl_intervals(options):
             except KeyError:
                 tx = []
 
-        if options.stdlog >= 2:
+        if options.loglevel >= 2:
             options.stdlog.write(
                 "###################################################\n")
             options.stdlog.write("# testing %s\n" % (str(match)))

From 7df645297c5e808bc9916f5bfa2048aea2e974f0 Mon Sep 17 00:00:00 2001
From: Andreas Heger <andreas.heger@gmail.com>
Date: Mon, 22 May 2017 13:10:41 +0100
Subject: [PATCH 02/26] {AH} add encoding option to openFile

---
 CGAT/IOTools.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/CGAT/IOTools.py b/CGAT/IOTools.py
index 00b33efe..e3f4aa69 100644
--- a/CGAT/IOTools.py
+++ b/CGAT/IOTools.py
@@ -201,7 +201,7 @@ def touchFile(filename, times=None):
         fhandle.close()
 
 
-def openFile(filename, mode="r", create_dir=False):
+def openFile(filename, mode="r", create_dir=False, encoding="utf-8"):
     '''open file called *filename* with mode *mode*.
 
     gzip - compressed files are recognized by the
@@ -235,9 +235,9 @@ def openFile(filename, mode="r", create_dir=False):
     if ext.lower() in (".gz", ".z"):
         if sys.version_info.major >= 3:
             if mode == "r":
-                return gzip.open(filename, 'rt', encoding="ascii")
+                return gzip.open(filename, 'rt', encoding=encoding)
             elif mode == "w":
-                return gzip.open(filename, 'wt', encoding="ascii")
+                return gzip.open(filename, 'wt', encoding=encoding)
             else:
                 raise NotImplementedError(
                     "mode '{}' not implemented".format(mode))

From aa8451694de2db419d731c39b27187d85c381bab Mon Sep 17 00:00:00 2001
From: Andreas Heger <andreas.heger@gmail.com>
Date: Mon, 22 May 2017 13:11:06 +0100
Subject: [PATCH 03/26] {AH} raise error if BAM does not contain read names

---
 CGAT/scripts/_bam2stats.pyx | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CGAT/scripts/_bam2stats.pyx b/CGAT/scripts/_bam2stats.pyx
index 4ece634f..d6d55240 100644
--- a/CGAT/scripts/_bam2stats.pyx
+++ b/CGAT/scripts/_bam2stats.pyx
@@ -145,6 +145,9 @@ def count(AlignmentFile samfile,
 
         if count_fastq:
             read_name = pysam_bam_get_qname(read._delegate)
+            if read_name == NULL:
+                raise ValueError("file does not contain read names, can't count using fastq")
+
             # terminate string at first space to
             # truncate read names containing more than
             # just the id

From b16f691f925e050ff784ffe84a0dead56a698a4c Mon Sep 17 00:00:00 2001
From: Andreas Heger <andreas.heger@gmail.com>
Date: Mon, 22 May 2017 16:29:24 +0100
Subject: [PATCH 04/26] {AH} log filename for easier debugging

---
 CGAT/Expression.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CGAT/Expression.py b/CGAT/Expression.py
index 3a298baf..33d2cbfb 100644
--- a/CGAT/Expression.py
+++ b/CGAT/Expression.py
@@ -2168,7 +2168,9 @@ def runEdgeR(outfile,
                                                             ref_regex)
 
     # output heatmap plot
-    R.png('%(outfile_prefix)sheatmap.png' % locals())
+    fn = '%(outfile_prefix)sheatmap.png' % locals()
+    E.info("outputing heatmap to {}".format(fn))
+    R.png(fn)
     plotCorrelationHeatmap()
     r['dev.off']()
 

From 5fbdcf5b8e33846d71d08e10f8c634c2d123eb72 Mon Sep 17 00:00:00 2001
From: Andreas Heger <andreas.heger@gmail.com>
Date: Mon, 22 May 2017 16:30:07 +0100
Subject: [PATCH 05/26] {AH} fix byte/string issue with touching gzipped files

---
 CGAT/IOTools.py | 25 +++++++++----------------
 1 file changed, 9 insertions(+), 16 deletions(-)

diff --git a/CGAT/IOTools.py b/CGAT/IOTools.py
index e3f4aa69..47b49a2a 100644
--- a/CGAT/IOTools.py
+++ b/CGAT/IOTools.py
@@ -189,11 +189,13 @@ def touchFile(filename, times=None):
     as empty 'gzip' files, i.e., with a header.
     '''
     existed = os.path.exists(filename)
-    fhandle = open(filename, 'a')
 
     if filename.endswith(".gz") and not existed:
         # this will automatically add a gzip header
+        fhandle = open(filename, 'a+b')
         fhandle = gzip.GzipFile(filename, fileobj=fhandle)
+    else:
+        fhandle = open(filename, 'a')
 
     try:
         os.utime(filename, times)
@@ -238,13 +240,12 @@ def openFile(filename, mode="r", create_dir=False, encoding="utf-8"):
                 return gzip.open(filename, 'rt', encoding=encoding)
             elif mode == "w":
                 return gzip.open(filename, 'wt', encoding=encoding)
-            else:
-                raise NotImplementedError(
-                    "mode '{}' not implemented".format(mode))
+            elif mode == "a":
+                return gzip.open(filename, 'wt', encoding=encoding)
         else:
             return gzip.open(filename, mode)
     else:
-        return open(filename, mode)
+        return open(filename, mode, encoding=encoding)
 
 
 def force_str(iterator, encoding="ascii"):
@@ -812,14 +813,6 @@ def __init__(self,
 
         self.mFiles = {}
         self.mOutputPattern = output_pattern
-
-        self.open = open
-
-        if output_pattern:
-            _, ext = os.path.splitext(output_pattern)
-            if ext.lower() in (".gz", ".z"):
-                self.open = gzip.open
-
         self.mCounts = collections.defaultdict(int)
         self.mHeader = header
         if force and output_pattern:
@@ -881,7 +874,7 @@ def openFile(self, filename, mode="w"):
             if dirname and not os.path.exists(dirname):
                 os.makedirs(dirname)
 
-        return self.open(filename, mode)
+        return openFile(filename, mode)
 
     def write(self, identifier, line):
         """write `line` to file specified by `identifier`"""
@@ -894,7 +887,7 @@ def write(self, identifier, line):
                     f.close()
                 self.mFiles = {}
 
-            self.mFiles[filename] = self.openFile(filename, "a")
+            self.mFiles[filename] = openFile(filename, "a")
             if self.mHeader:
                 self.mFiles[filename].write(self.mHeader)
 
@@ -947,7 +940,7 @@ def close(self):
             raise IOError("write on closed FilePool in close()")
 
         for filename, data in self.data.items():
-            f = self.openFile(filename, "a")
+            f = openFile(filename, "a")
             if self.mHeader:
                 f.write(self.mHeader)
             f.write("".join(data))

From 9e8e46092439a071d6fee547ef4a10ce6c03ebbc Mon Sep 17 00:00:00 2001
From: Andreas Heger <andreas.heger@gmail.com>
Date: Mon, 22 May 2017 16:30:26 +0100
Subject: [PATCH 06/26] {AH} open temp file in text mode

---
 CGAT/Masker.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/CGAT/Masker.py b/CGAT/Masker.py
index ea97b8cd..12b54853 100644
--- a/CGAT/Masker.py
+++ b/CGAT/Masker.py
@@ -141,12 +141,9 @@ def maskSequence(self, peptide_sequence):
     def maskSequences(self, sequences):
         '''mask a collection of sequences.'''
 
-        outfile, infile = tempfile.mkstemp()
-
-        for x, s in enumerate(sequences):
-            os.write(outfile, ">%i\n%s\n" % (x, s))
-
-        os.close(outfile)
+        with tempfile.NamedTemporaryFile(mode="w+t", delete=False) as infile:
+            for x, s in enumerate(sequences):
+                infile.write(">%i\n%s\n" % (x, s))
 
         statement = self.mCommand % locals()
 
@@ -168,7 +165,7 @@ def maskSequences(self, sequences):
         result = [
             x.sequence for x in FastaIterator.iterate(StringIO(out))]
 
-        os.remove(infile)
+        os.remove(infile.name)
 
         return result
 

From 61edc07a963db559e5c428ba8906b192cb3ff2f5 Mon Sep 17 00:00:00 2001
From: Andreas Heger <andreas.heger@gmail.com>
Date: Mon, 22 May 2017 16:30:41 +0100
Subject: [PATCH 07/26] {AH} use globals in exec

---
 CGAT/scripts/csv_select.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/CGAT/scripts/csv_select.py b/CGAT/scripts/csv_select.py
index 1aceb247..2f81b71b 100644
--- a/CGAT/scripts/csv_select.py
+++ b/CGAT/scripts/csv_select.py
@@ -80,8 +80,7 @@ def main(argv=None):
         reader = csv.DictReader(CSV.CommentStripper(sys.stdin),
                                 dialect=options.csv_dialect)
 
-    exec("f = lambda r: %s" % statement, locals())
-
+    exec("f = lambda r: %s" % statement, globals())
     counter = E.Counter()
     writer = csv.DictWriter(options.stdout,
                             reader.fieldnames,

From dd5348516efd4774cafab0da12ca73dee84871da Mon Sep 17 00:00:00 2001
From: Andreas Heger <andreas.heger@gmail.com>
Date: Mon, 22 May 2017 16:31:02 +0100
Subject: [PATCH 08/26] {AH} open temp file in text mode

---
 CGAT/scripts/runExpression.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CGAT/scripts/runExpression.py b/CGAT/scripts/runExpression.py
index 8d763768..9f30713c 100644
--- a/CGAT/scripts/runExpression.py
+++ b/CGAT/scripts/runExpression.py
@@ -284,7 +284,7 @@ def main(argv=None):
     (options, args) = E.Start(parser, argv=argv, add_output_options=True)
 
     if options.input_filename_tags == "-":
-        fh = tempfile.NamedTemporaryFile(delete=False)
+        fh = tempfile.NamedTemporaryFile(delete=False, mode="w+t")
         fh.write("".join([x for x in options.stdin]))
         fh.close()
         options.input_filename_tags = fh.name

From 8e6580af9f4ba9a97351912b45fa2b3e8bcf7b32 Mon Sep 17 00:00:00 2001
From: Andreas Heger <andreas.heger@gmail.com>
Date: Mon, 22 May 2017 16:56:15 +0100
Subject: [PATCH 09/26] {AH} decode subprocess output for parsing

---
 CGAT/Masker.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/CGAT/Masker.py b/CGAT/Masker.py
index 12b54853..2582fd28 100644
--- a/CGAT/Masker.py
+++ b/CGAT/Masker.py
@@ -141,10 +141,11 @@ def maskSequence(self, peptide_sequence):
     def maskSequences(self, sequences):
         '''mask a collection of sequences.'''
 
-        with tempfile.NamedTemporaryFile(mode="w+t", delete=False) as infile:
+        with tempfile.NamedTemporaryFile(mode="w+t", delete=False) as outf:
             for x, s in enumerate(sequences):
-                infile.write(">%i\n%s\n" % (x, s))
+                outf.write(">%i\n%s\n" % (x, s))
 
+        infile = outf.name
         statement = self.mCommand % locals()
 
         E.debug("statement: %s" % statement)
@@ -161,11 +162,11 @@ def maskSequences(self, sequences):
             raise RuntimeError(
                 "Error in running %s \n%s\nTemporary directory" %
                 (statement, err))
-
+        
         result = [
-            x.sequence for x in FastaIterator.iterate(StringIO(out))]
+            x.sequence for x in FastaIterator.iterate(StringIO(out.decode()))]
 
-        os.remove(infile.name)
+        os.remove(infile)
 
         return result
 

From 8b3ffa52e825d144a5dda7ef7353f2562e3f3e64 Mon Sep 17 00:00:00 2001
From: MikeDMorgan <michael.morgan@dpag.ox.ac.uk>
Date: Sun, 2 Apr 2017 08:53:38 +0100
Subject: [PATCH 10/26] add cassi results format parsing

---
 CGAT/PipelineGWAS.py          | 43 +++++++++++++++++++++++++-------
 CGAT/scripts/assoc2plot.py    | 14 ++++++++---
 CGAT/scripts/extract_stats.py | 46 ++++++++++++++++++++++++++++++++---
 R/epistasis_covar.R           | 18 +++++++++-----
 4 files changed, 99 insertions(+), 22 deletions(-)

diff --git a/CGAT/PipelineGWAS.py b/CGAT/PipelineGWAS.py
index 8ec1945a..47a1811c 100755
--- a/CGAT/PipelineGWAS.py
+++ b/CGAT/PipelineGWAS.py
@@ -2184,9 +2184,10 @@ def parse_genome_wide(self, association_files):
         return df
 
     def get_results(self, association_file,
-                    epistasis=False):
+                    epistasis=False,
+                    file_format="plink"):
         '''
-        Parse a GWA results file and return the table
+        Parse a GWA or epistasis results file and return the table
         '''
 
         # use Pandas for now - try something different later
@@ -2216,16 +2217,40 @@ def get_results(self, association_file,
                                               index_col=None)
             except StopIteration:
                 results_frame = pd.read_table(association_file,
-                                              sep="\t", header=None,
+                                              sep="\t", header=0,
                                               index_col=None)
 
             # results from fast epistasis are different to others
-            if results_frame.shape[1] == 7:
-                results_frame.columns = ["CHR1", "SNP1", "CHR",
-                                         "SNP", "OR", "STAT", "P"]
-            else:
-                results_frame.columns = ["CHR", "SNP", "BP", "A1", "OR",
-                                         "SE", "STAT", "P"]
+            if file_format == "cassi_covar":
+                if results_frme.shape[1] == 12:
+                    results_frame.columns = ["SNP1", "CHR1", "ID1", "BP1",
+                                             "SNP2", "CHR2", "ID2", "BP2",
+                                             "OR", "SE", "STAT", "P"]
+                elif results_frame.shape[1] == 14:
+                    results_frame.columns = ["SNP1", "CHR1", "ID1", "BP1",
+                                             "SNP2", "CHR2", "ID2", "BP2",
+                                             "OR", "SE", "STAT", "P",
+                                             "CASE_RSQ", "CTRL_RSQ"]
+                elif results_frame.shape[1] == 16:
+                    results_frame.columns = ["SNP1", "CHR1", "ID1", "BP",
+                                             "SNP2", "CHR2", "ID2", "BP2",
+                                             "OR", "SE", "STAT", "P",
+                                             "CASE_RSQ", "CTRL_RSQ",
+                                             "CASE_DPRIME" "CTRL_DPRIME"]
+                results_frame.loc[:, "BP"] = pd.to_numeric(results_frame["BP"],
+                                                            errors="coerce")
+            elif file_format == "cassi":
+                pass
+            elif file_format == "plink":
+                if results_frame.shape[1] == 7:
+                    results_frame.columns = ["CHR1", "SNP1", "CHR",
+                                             "SNP", "OR", "STAT", "P"]
+                elif results_frame.shape[1] == 9:
+                    results_frame.columns = ["CHR", "SNP", "BP", "A1", "NMISS",
+                                             "OR", "SE", "STAT", "P"]
+                else:
+                    results_frame.columns = ["CHR", "SNP", "BP", "A1", "OR",
+                                             "SE", "STAT", "P"]
 
                 results_frame.loc[:, "BP"] = pd.to_numeric(results_frame["BP"],
                                                            errors="coerce")
diff --git a/CGAT/scripts/assoc2plot.py b/CGAT/scripts/assoc2plot.py
index c95813cc..ad6c3c65 100755
--- a/CGAT/scripts/assoc2plot.py
+++ b/CGAT/scripts/assoc2plot.py
@@ -61,6 +61,11 @@ def main(argv=None):
                       "depicts the whole genome, a single chromosome or "
                       "a specific locus")
 
+    parser.add_option("--file-format", dest="file_format", type="choice",
+                      choices=["plink", "cassi", "cassi_covar"],
+                      help="input file format, used to parse the file "
+                      "properly")
+    
     parser.add_option("--save-path", dest="save_path", type="string",
                       help="path and filename to save image to")
 
@@ -68,7 +73,8 @@ def main(argv=None):
     (options, args) = E.Start(parser, argv=argv)
 
     parser.set_defaults(resolution="genome_wide",
-                        plot_type="manhattan")
+                        plot_type="manhattan",
+                        file_format="plink")
 
     # if the input is a list of files, split them
     infile = argv[-1]
@@ -82,10 +88,12 @@ def main(argv=None):
 
     if len(infiles) > 1:
         results = gwas.GWASResults(assoc_file=infiles,
-                                   epistasis=epi)
+                                   epistasis=epi,
+                                   file_format=options.file_format)
     elif len(infiles) == 1:
         results = gwas.GWASResults(assoc_file=infile,
-                                   epistasis=epi)
+                                   epistasis=epi,
+                                   file_format=options.file_format)
     else:
         raise IOError("no input files detected, please specifiy association "
                       "results files as the last command line argument")
diff --git a/CGAT/scripts/extract_stats.py b/CGAT/scripts/extract_stats.py
index 41139298..623865a6 100644
--- a/CGAT/scripts/extract_stats.py
+++ b/CGAT/scripts/extract_stats.py
@@ -52,15 +52,35 @@
 import numpy as np
 import re
 import sqlite3 as sql
+from sqlalchemy import *
+import MySQLdb
 
 
-def getTableFromDb(db, table):
+def getTableFromDb(db, table, backend,
+                   username, db_hostname,
+                   db_port):
     '''
     Get a table from a database with pandas
     '''
 
-    state = ''' SELECT * FROM %(table)s;''' % locals()
-    dbh = sql.connect(db)
+    state = text(''' SELECT * FROM %(table)s;''' % locals())
+    if backend == "sqlite":
+        create_string = "sqlite:///{}".format(db)
+    elif backend == "mysql":
+        dbhandle = MySQLdb.connect(host=db_hostname,
+                                   user=username,
+                                   passwd="",
+                                   port=db_port,
+                                   db=db)
+        
+        #create_string = '''mysql://cgat@gandalf.anat.ox.ac.uk/%(db)s''' % locals()
+    else:
+        raise TypeError("database backend not recognised")
+
+    #engine = create_engine(create_string)
+    #dbh = engine.connect()
+    #dbh = sql.connect(db)
+    
     df = pdsql.read_sql(state, dbh)
     df.index = df["track"]
     df.drop(labels="track", inplace=True,
@@ -219,6 +239,19 @@ def main(argv=None):
     parser.add_option("-d", "--database", dest="database", type="string",
                       help="SQLite database containing relevant tables")
 
+    parser.add_option("--database-backend", dest="database_backend",
+                      type="choice", choices=["sqlite", "mysql"],
+                      help="database backend to use")
+
+    parser.add_option("--database-username", dest="database_username",
+                      type="string", help="MySQL username")
+
+    parser.add_option("--database-hostname", dest="database_hostname",
+                      type="string", help="MySQL server hostname")
+
+    parser.add_option("--database-port", dest="databse_port",
+                      type="string", help="MySQL server port")
+
     parser.add_option("-t", "--table-name", dest="table", type="string",
                       help="table in SQLite DB to extract")
 
@@ -226,8 +259,13 @@ def main(argv=None):
     (options, args) = E.Start(parser, argv=argv)
 
     if options.task == "extract_table":
+        print options.database_hostname
         out_df = getTableFromDb(db=options.database,
-                                table=options.table)
+                                table=options.table,
+                                backend=options.database_backend,
+                                username=options.database_username,
+                                db_hostname=options.database_hostname,
+                                db_port=int(options.databse_port))
 
     elif options.task == "get_coverage":
         out_df = getModelCoverage(db=options.database,
diff --git a/R/epistasis_covar.R b/R/epistasis_covar.R
index b0da405c..87eda449 100644
--- a/R/epistasis_covar.R
+++ b/R/epistasis_covar.R
@@ -8,15 +8,21 @@ Rplink <- function(PHENO, GENO, CLUSTER, COVAR){
   epi.log <- function(snp){
     # covar positions are based on the input data table,
     # not the position they are passed in!
-    # Assume SNP is the first column
-    test.snp = COVAR[, 1]
-    covars = COVAR[,(2:dim(COVAR)[2])]
-    m <- glm(PHENO == 2 ~ snp + covars + test.snp*snp, family=binomial(link="logit"))
+    # Assume SNP is the last column
+    n.covar <- dim(COVAR)[2]
+    test.snp <- COVAR[, n.covar]
+    if(dim(COVAR)[2] > 2){
+    	covars <- COVAR[, 1:(n.covar - 1)]
+    }
+    else {
+        covars <- as.matrix(COVAR[, 2], ncol=1)
+    }
+    m <- glm(PHENO == 2 ~ covars + test.snp + snp + test.snp*snp, family=binomial(link="logit"))
     len <- dim(summary(m)$coefficients)[1]
     sum.snp <- summary(m)$coefficients[len,]
     ors <- exp(sum.snp[1])
-    r <- c(ors, sum.snp[2], sum.snp[3], sum.snp[4])
-    names(r) <- c("OR", "SE", "STAT", "P")
+    r <- c(dim(covars)[1], ors, sum.snp[2], sum.snp[3], sum.snp[4])
+    names(r) <- c("NMISS", "OR", "SE", "STAT", "P")
     
     return(c(length(r), r))
   }

From e06c9f252c8c9c5e852098937fa6bf321a0ce9ef Mon Sep 17 00:00:00 2001
From: MikeDMorgan <michael.morgan@dpag.ox.ac.uk>
Date: Mon, 22 May 2017 20:43:10 +0100
Subject: [PATCH 11/26] add correct extract_stats.py for pipeline tests

---
 CGAT/scripts/extract_stats.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/CGAT/scripts/extract_stats.py b/CGAT/scripts/extract_stats.py
index 623865a6..23ff7bfe 100644
--- a/CGAT/scripts/extract_stats.py
+++ b/CGAT/scripts/extract_stats.py
@@ -63,9 +63,11 @@ def getTableFromDb(db, table, backend,
     Get a table from a database with pandas
     '''
 
-    state = text(''' SELECT * FROM %(table)s;''' % locals())
+    state = '''SELECT * FROM %(table)s''' % locals()
+    # state = text('''SELECT * FROM %(table)s;''' % locals())
     if backend == "sqlite":
         create_string = "sqlite:///{}".format(db)
+        dbh = sql.connect(db)
     elif backend == "mysql":
         dbhandle = MySQLdb.connect(host=db_hostname,
                                    user=username,
@@ -79,9 +81,8 @@ def getTableFromDb(db, table, backend,
 
     #engine = create_engine(create_string)
     #dbh = engine.connect()
-    #dbh = sql.connect(db)
     
-    df = pdsql.read_sql(state, dbh)
+    df = pdsql.read_sql(sql=state, con=dbh)
     df.index = df["track"]
     df.drop(labels="track", inplace=True,
             axis=1)
@@ -259,7 +260,6 @@ def main(argv=None):
     (options, args) = E.Start(parser, argv=argv)
 
     if options.task == "extract_table":
-        print options.database_hostname
         out_df = getTableFromDb(db=options.database,
                                 table=options.table,
                                 backend=options.database_backend,

From 739078b1228aee7b7746959ce6103e56e7a09782 Mon Sep 17 00:00:00 2001
From: MikeDMorgan <michael.morgan@dpag.ox.ac.uk>
Date: Mon, 22 May 2017 22:50:37 +0100
Subject: [PATCH 12/26] fix pep8 error

---
 CGAT/PipelineGWAS.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CGAT/PipelineGWAS.py b/CGAT/PipelineGWAS.py
index 47a1811c..002d8db3 100755
--- a/CGAT/PipelineGWAS.py
+++ b/CGAT/PipelineGWAS.py
@@ -2238,7 +2238,7 @@ def get_results(self, association_file,
                                              "CASE_RSQ", "CTRL_RSQ",
                                              "CASE_DPRIME" "CTRL_DPRIME"]
                 results_frame.loc[:, "BP"] = pd.to_numeric(results_frame["BP"],
-                                                            errors="coerce")
+                                                           errors="coerce")
             elif file_format == "cassi":
                 pass
             elif file_format == "plink":

From a2d0f2fe5ffefcc378a617ed601b1a603e5b2c75 Mon Sep 17 00:00:00 2001
From: Andreas Heger <andreas.heger@gmail.com>
Date: Tue, 23 May 2017 13:46:11 +0100
Subject: [PATCH 13/26] {AH} add GTF_test

---
 tests/GTF_test.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 tests/GTF_test.py

diff --git a/tests/GTF_test.py b/tests/GTF_test.py
new file mode 100644
index 00000000..52179515
--- /dev/null
+++ b/tests/GTF_test.py
@@ -0,0 +1,21 @@
+import unittest
+import os
+
+import CGAT.GTF as GTF
+import CGAT.IOTools as IOTools
+
+class TestIteration(unittest.TestCase):
+
+    filename = os.path.join("data", "hg19.small.gtf.gz")
+
+    def test_number_of_intervals_is_correct(self):
+        
+        with IOTools.openFile(self.filename) as inf:
+            records = list(GTF.iterator(inf))
+        
+        self.assertEqual(len(records),
+                         100)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 35be2f385d893560dc5187f242b046ac9efd2048 Mon Sep 17 00:00:00 2001
From: Andreas Heger <andreas.heger@gmail.com>
Date: Tue, 23 May 2017 13:50:16 +0100
Subject: [PATCH 14/26] {AH}

---
 CGAT/FastaIterator.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/CGAT/FastaIterator.py b/CGAT/FastaIterator.py
index f8508929..d941eade 100644
--- a/CGAT/FastaIterator.py
+++ b/CGAT/FastaIterator.py
@@ -92,7 +92,6 @@ def iterate(infile, comment="#", fold=False):
     ------
     FastaRecord
     '''
-
     h = infile.readline()[:-1]
 
     if not h:

From f4b878c2acd0c9936dc4c0e971c5506ab1359a2b Mon Sep 17 00:00:00 2001
From: Andreas Heger <andreas.heger@gmail.com>
Date: Tue, 23 May 2017 13:56:29 +0100
Subject: [PATCH 15/26] {AH} open file in text mode

---
 CGAT/scripts/cgat_fasta2cDNA.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/CGAT/scripts/cgat_fasta2cDNA.py b/CGAT/scripts/cgat_fasta2cDNA.py
index 3decf5c4..7aa7cd5e 100644
--- a/CGAT/scripts/cgat_fasta2cDNA.py
+++ b/CGAT/scripts/cgat_fasta2cDNA.py
@@ -1,18 +1,12 @@
 '''
-cgat_fasta2cDNA.py - template for CGAT scripts
-====================================================
+cgat_fasta2cDNA.py - converting multi-fasta of exon features into a multi-fasta of spliced cDNAs/RNAs
+======================================================================================================
 
-:Author:
-:Release: $Id$
-:Date: |today|
 :Tags: Python
 
 Purpose
 -------
 
-.. Mike transcript processing - converting multi-fasta of exon
-features into a multi-fasta of spliced cDNAs/RNAs
-
 Usage
 -----
 
@@ -45,7 +39,7 @@ def makeSplicedFasta(infile):
     '''
 
     fasta_dict = {}
-    with IOTools.openFile(infile, "rb") as fafile:
+    with IOTools.openFile(infile) as fafile:
         for line in fafile.readlines():
             if line[0] == '>':
                 header = line.rstrip("\n")

From d706a14c19f30eaad0513e6ec73ab0edcf7c3a9f Mon Sep 17 00:00:00 2001
From: Andreas Heger <andreas.heger@gmail.com>
Date: Tue, 23 May 2017 16:43:16 +0100
Subject: [PATCH 16/26] {AH} py3 fixes

---
 CGAT/scripts/extract_stats.py | 6 ++++--
 tests/GTF_test.py             | 5 +++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/CGAT/scripts/extract_stats.py b/CGAT/scripts/extract_stats.py
index 23ff7bfe..d1799cec 100644
--- a/CGAT/scripts/extract_stats.py
+++ b/CGAT/scripts/extract_stats.py
@@ -95,9 +95,11 @@ def cleanStatsTable(stats_file):
     Take in a table containing aggregated stats
     and clean by removing duplicate columns
     '''
-
+    # , mangle_dupe_cols=False)
+    # AH: disabled, because "ValueError: Setting mangle_dupe_cols=False is not supported yet"
     _df = pd.read_table(stats_file, sep="\t", header=0,
-                        index_col=None, mangle_dupe_cols=False)
+                        index_col=None)
+
     # drop duplicates is case sensitive, convert all to
     # same case - SQL is not case sensitive so will throw
     # a hissy fit for same column names in different cases
diff --git a/tests/GTF_test.py b/tests/GTF_test.py
index 52179515..7990c7d2 100644
--- a/tests/GTF_test.py
+++ b/tests/GTF_test.py
@@ -4,15 +4,16 @@
 import CGAT.GTF as GTF
 import CGAT.IOTools as IOTools
 
+
 class TestIteration(unittest.TestCase):
 
     filename = os.path.join("data", "hg19.small.gtf.gz")
 
     def test_number_of_intervals_is_correct(self):
-        
+
         with IOTools.openFile(self.filename) as inf:
             records = list(GTF.iterator(inf))
-        
+
         self.assertEqual(len(records),
                          100)
 

From f03c2112cd8022305d1e95a48e7ec94cbdc88291 Mon Sep 17 00:00:00 2001
From: Andreas Heger <andreas.heger@gmail.com>
Date: Mon, 5 Jun 2017 10:38:52 +0100
Subject: [PATCH 17/26] {AH} fix output sort order

---
 CGAT/scripts/cgat_fasta2cDNA.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CGAT/scripts/cgat_fasta2cDNA.py b/CGAT/scripts/cgat_fasta2cDNA.py
index 7aa7cd5e..1b2593cc 100644
--- a/CGAT/scripts/cgat_fasta2cDNA.py
+++ b/CGAT/scripts/cgat_fasta2cDNA.py
@@ -47,7 +47,7 @@ def makeSplicedFasta(infile):
             else:
                 fasta_dict[header] += line.rstrip("\n")
 
-    for key, value in fasta_dict.items():
+    for key, value in sorted(fasta_dict.items()):
         yield "%s\n%s\n" % (key, value)
 
 

From 22f85982b7f1962118ff2aecbbdc9fd114bfeeca Mon Sep 17 00:00:00 2001
From: Sebastian Luna Valero <sebastian.luna.valero@gmail.com>
Date: Thu, 8 Jun 2017 09:24:34 +0100
Subject: [PATCH 18/26]  remove Py2 testing in Py3 branch

---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index f991b6e9..eb0b8f64 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -20,7 +20,7 @@ env:
     - TEST_ALL=1
 
 python:
-  - "2.7"
+#  - "2.7"
   - "3.5"
 
 # Using xvfb to Run Tests That Require a GUI

From 34ef3666fc2c7b54b41da5fa6fd8a46c8da89aca Mon Sep 17 00:00:00 2001
From: Sebastian Luna Valero <sebastian.luna.valero@gmail.com>
Date: Thu, 8 Jun 2017 09:43:37 +0100
Subject: [PATCH 19/26] force icu update

---
 install-CGAT-tools.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/install-CGAT-tools.sh b/install-CGAT-tools.sh
index 53429557..58ceb963 100755
--- a/install-CGAT-tools.sh
+++ b/install-CGAT-tools.sh
@@ -292,7 +292,7 @@ conda create -q -n $CONDA_INSTALL_ENV $CONDA_INSTALL_TYPE python=$INSTALL_PYTHON
 
 # SLV brute force until pysam problem is solved
 conda uninstall -n $CONDA_INSTALL_ENV pysam --override-channels --channel conda-forge --channel defaults --channel r --channel bioconda --yes
-conda install -n $CONDA_INSTALL_ENV pysam=0.11.1 pybedtools --override-channels --channel conda-forge --channel defaults --channel r --channel bioconda --yes
+conda install -n $CONDA_INSTALL_ENV pysam=0.11.1 pybedtools icu=58 --override-channels --channel conda-forge --channel defaults --channel r --channel bioconda --yes
 
 log "installing CGAT code into conda environment"
 # if installation is 'devel' (outside of travis), checkout latest version from github

From d0a9d3696a0b503f095d1b6c0fd9f8186405f11c Mon Sep 17 00:00:00 2001
From: Sebastian Luna Valero <sebastian.luna.valero@gmail.com>
Date: Thu, 8 Jun 2017 10:15:19 +0100
Subject: [PATCH 20/26]  try different channel order for conda

---
 install-CGAT-tools.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/install-CGAT-tools.sh b/install-CGAT-tools.sh
index 58ceb963..763fd5a2 100755
--- a/install-CGAT-tools.sh
+++ b/install-CGAT-tools.sh
@@ -291,8 +291,8 @@ log "installing conda CGAT environment"
 conda create -q -n $CONDA_INSTALL_ENV $CONDA_INSTALL_TYPE python=$INSTALL_PYTHON_VERSION gcc --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes
 
 # SLV brute force until pysam problem is solved
-conda uninstall -n $CONDA_INSTALL_ENV pysam --override-channels --channel conda-forge --channel defaults --channel r --channel bioconda --yes
-conda install -n $CONDA_INSTALL_ENV pysam=0.11.1 pybedtools icu=58 --override-channels --channel conda-forge --channel defaults --channel r --channel bioconda --yes
+conda uninstall -n $CONDA_INSTALL_ENV pysam --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes
+conda install -n $CONDA_INSTALL_ENV pysam=0.11.1 pybedtools --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes
 
 log "installing CGAT code into conda environment"
 # if installation is 'devel' (outside of travis), checkout latest version from github

From edae8e020bb0760dcaf361193e7e1245ad90a67c Mon Sep 17 00:00:00 2001
From: Sebastian Luna Valero <sebastian.luna.valero@gmail.com>
Date: Thu, 8 Jun 2017 10:30:31 +0100
Subject: [PATCH 21/26]  try different conda command

---
 install-CGAT-tools.sh | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/install-CGAT-tools.sh b/install-CGAT-tools.sh
index 763fd5a2..d72eb0fa 100755
--- a/install-CGAT-tools.sh
+++ b/install-CGAT-tools.sh
@@ -288,11 +288,12 @@ conda update -q conda --yes
 conda info -a
 
 log "installing conda CGAT environment"
-conda create -q -n $CONDA_INSTALL_ENV $CONDA_INSTALL_TYPE python=$INSTALL_PYTHON_VERSION gcc --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes
+#conda create -q -n $CONDA_INSTALL_ENV $CONDA_INSTALL_TYPE python=$INSTALL_PYTHON_VERSION gcc --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes
+conda create -q -n $CONDA_INSTALL_ENV $CONDA_INSTALL_TYPE python=$INSTALL_PYTHON_VERSION pysam=0.11.1 r=3.3.1 gcc --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes
 
 # SLV brute force until pysam problem is solved
-conda uninstall -n $CONDA_INSTALL_ENV pysam --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes
-conda install -n $CONDA_INSTALL_ENV pysam=0.11.1 pybedtools --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes
+#conda uninstall -n $CONDA_INSTALL_ENV pysam --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes
+#conda install -n $CONDA_INSTALL_ENV pysam=0.11.1 pybedtools --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes
 
 log "installing CGAT code into conda environment"
 # if installation is 'devel' (outside of travis), checkout latest version from github

From 27f65e0e8a79a98c71de2d0bf128337ae2a9f3cd Mon Sep 17 00:00:00 2001
From: Sebastian Luna Valero <sebastian.luna.valero@gmail.com>
Date: Thu, 8 Jun 2017 10:54:24 +0100
Subject: [PATCH 22/26]  try Py2 testing again

---
 .travis.yml           | 2 +-
 install-CGAT-tools.sh | 8 +++-----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index eb0b8f64..f991b6e9 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -20,7 +20,7 @@ env:
     - TEST_ALL=1
 
 python:
-#  - "2.7"
+  - "2.7"
   - "3.5"
 
 # Using xvfb to Run Tests That Require a GUI
diff --git a/install-CGAT-tools.sh b/install-CGAT-tools.sh
index d72eb0fa..95cba8f2 100755
--- a/install-CGAT-tools.sh
+++ b/install-CGAT-tools.sh
@@ -288,13 +288,11 @@ conda update -q conda --yes
 conda info -a
 
 log "installing conda CGAT environment"
-#conda create -q -n $CONDA_INSTALL_ENV $CONDA_INSTALL_TYPE python=$INSTALL_PYTHON_VERSION gcc --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes
+# SLV workaround until problems are resolved for:
+# * Latest R 3.3.2 available in conda: Problems with icu here https://travis-ci.org/CGATOxford/cgat/builds/240711411
+# * Latest pysam works with the CGAT code: the gff3 support is broken, best not to use 0.11.2 for now (AH comment)
 conda create -q -n $CONDA_INSTALL_ENV $CONDA_INSTALL_TYPE python=$INSTALL_PYTHON_VERSION pysam=0.11.1 r=3.3.1 gcc --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes
 
-# SLV brute force until pysam problem is solved
-#conda uninstall -n $CONDA_INSTALL_ENV pysam --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes
-#conda install -n $CONDA_INSTALL_ENV pysam=0.11.1 pybedtools --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes
-
 log "installing CGAT code into conda environment"
 # if installation is 'devel' (outside of travis), checkout latest version from github
 if [[ "$OS" != "travis" ]] ; then

From 6399552f163df205be1dcec383149708b9cb4546 Mon Sep 17 00:00:00 2001
From: Sebastian Luna Valero <sebastian.luna.valero@gmail.com>
Date: Thu, 8 Jun 2017 11:07:43 +0100
Subject: [PATCH 23/26]  Py2's open() function does not support the 'encoding'
 argument so removing Py2 tests from now on

---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index f991b6e9..eb0b8f64 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -20,7 +20,7 @@ env:
     - TEST_ALL=1
 
 python:
-  - "2.7"
+#  - "2.7"
   - "3.5"
 
 # Using xvfb to Run Tests That Require a GUI

From 007e34d23185e9e996c47ec01c1cade05ea495ea Mon Sep 17 00:00:00 2001
From: Sebastian Luna Valero <sebastian.luna.valero@gmail.com>
Date: Thu, 8 Jun 2017 11:26:53 +0100
Subject: [PATCH 24/26]  add important reference about issue with R 3.3.2 and
 icu library

---
 install-CGAT-tools.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/install-CGAT-tools.sh b/install-CGAT-tools.sh
index 95cba8f2..7e0fbfc9 100755
--- a/install-CGAT-tools.sh
+++ b/install-CGAT-tools.sh
@@ -289,7 +289,7 @@ conda info -a
 
 log "installing conda CGAT environment"
 # SLV workaround until problems are resolved for:
-# * Latest R 3.3.2 available in conda: Problems with icu here https://travis-ci.org/CGATOxford/cgat/builds/240711411
+# * Latest R 3.3.2 available in conda: Problems with icu here https://travis-ci.org/CGATOxford/cgat/builds/240711411 and here https://github.com/ContinuumIO/anaconda-issues/issues/1403
 # * Latest pysam works with the CGAT code: the gff3 support is broken, best not to use 0.11.2 for now (AH comment)
 conda create -q -n $CONDA_INSTALL_ENV $CONDA_INSTALL_TYPE python=$INSTALL_PYTHON_VERSION pysam=0.11.1 r=3.3.1 gcc --override-channels --channel bioconda --channel r --channel defaults --channel conda-forge --yes
 

From 3ac8b568329e13fba5574a3786776e69338a3879 Mon Sep 17 00:00:00 2001
From: Charlie-George <charlotte.george@dpag.ox.ac.uk>
Date: Tue, 27 Jun 2017 19:08:43 +0100
Subject: [PATCH 25/26] changed float to integer division for shift calculation
 for py3 compatability

---
 CGAT/scripts/bed2table.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CGAT/scripts/bed2table.py b/CGAT/scripts/bed2table.py
index 5841f456..3a1cb26d 100644
--- a/CGAT/scripts/bed2table.py
+++ b/CGAT/scripts/bed2table.py
@@ -239,7 +239,7 @@ def _count(self, bed, bamfiles, offsets):
             # if offsets are given, shift tags.
             for samfile, offset in zip(bamfiles, offsets):
 
-                shift = offset / 2
+                shift = offset // 2
                 # for peak counting I follow the MACS protocoll,
                 # see the function def __tags_call_peak in PeakDetect.py
                 # In words

From 56ea408990877f7466795bdd55b194e4bd108eaa Mon Sep 17 00:00:00 2001
From: Andreas Heger <andreas.heger@gmail.com>
Date: Mon, 26 Jun 2017 19:55:24 +0100
Subject: [PATCH 26/26] {AH} py3-fix for getLastLine

---
 CGAT/IOTools.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/CGAT/IOTools.py b/CGAT/IOTools.py
index 47b49a2a..099a9e99 100644
--- a/CGAT/IOTools.py
+++ b/CGAT/IOTools.py
@@ -68,7 +68,7 @@ def getFirstLine(filename, nlines=1):
     return line
 
 
-def getLastLine(filename, nlines=1, read_size=1024):
+def getLastLine(filename, nlines=1, read_size=1024, encoding="utf-8"):
     """return the last line of a file.
 
     This method works by working back in blocks of `read_size` until
@@ -90,8 +90,8 @@ def getLastLine(filename, nlines=1, read_size=1024):
 
     """
 
-    # U is to open it with Universal newline support
-    f = open(filename, 'rU')
+    # py3 requires binary mode for negative seeks
+    f = open(filename, 'rb')
     offset = read_size
     f.seek(0, 2)
     file_size = f.tell()
@@ -102,10 +102,8 @@ def getLastLine(filename, nlines=1, read_size=1024):
             offset = file_size
         f.seek(-1 * offset, 2)
         read_str = f.read(offset)
-        # Remove newline at the end
-        if read_str[offset - 1] == '\n':
-            read_str = read_str[:-1]
-        lines = read_str.split('\n')
+        read_str = read_str.decode(encoding)
+        lines = read_str.strip().splitlines()
         if len(lines) >= nlines + 1:
             return "\n".join(lines[-nlines:])
         if offset == file_size:   # reached the beginning