Updated to python3 #292

Open · wants to merge 2 commits into base: master
2 changes: 2 additions & 0 deletions .gitignore
@@ -1,10 +1,12 @@
bd2k-extras/
pimmuno.py
pimmuno_2.py
*.pyc
/src/*.egg-info/
develop_data/
venv/
.cache/
jobStore/
test-report.xml
__pycache__
*.DONE
62 changes: 15 additions & 47 deletions MANUAL.md
@@ -27,87 +27,55 @@ ProTECT is implemented in the [Toil](https://github.com/BD2KGenomics/toil.git) f
runs the workflow described in [protect/Flowchart.txt](
https://github.com/BD2KGenomics/protect/blob/master/Flowchart.txt).

**This manual is a quick adaptation of the original for the python3 port of ProTECT.**


# Installation

ProTECT requires Toil and we recommend installing ProTECT and its requirements in a
[virtualenv](http://docs.python-guide.org/en/latest/dev/virtualenvs/).

ProTECT also requires [s3am](https://github.com/BD2KGenomics/s3am.git) version 2.0.1 to download and
~ProTECT also requires [s3am](https://github.com/BD2KGenomics/s3am.git) version 2.0.1 to download and
upload files from S3. We recommend installing s3am in its own virtualenv using the directions in
the s3am manual, then putting the s3am binary on your $PATH. ProTECT will NOT attempt to install
s3am during installation.
s3am during installation.~

ProTECT uses pkg_resources from setuptools to verify versions of tools during install. As of setuptools
39.0.1, some modules were moved to the packaging module. If your machine has setuptools >=39.0.1, you
will need the packaging module.
The port is currently a work in progress: for now, **only references to local files will work**; anything that requires s3am (S3 buckets) will **fail**.

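For context on the setuptools note above, here is a minimal sketch of the style of version check ProTECT performs with pkg_resources (illustrative only; the pin shown is an example and is not copied from the ProTECT source):

    # Illustrative sketch: verify an installed dependency's version.
    import pkg_resources                   # ships with setuptools
    from packaging.version import Version  # needed alongside setuptools >= 39.0.1

    installed = Version(pkg_resources.get_distribution('toil').version)
    if installed < Version('3.5.2'):       # example pin only; see the Makefile for the real pins
        raise RuntimeError('toil is older than the expected minimum version')
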
Lastly, ProTECT uses [docker](https://www.docker.com/) to run the various sub-tools in a
reproducible, platform independent manner. ProTECT will NOT attempt to install docker during
installation.

### Method 1 - Using PIP (recommended)

First create a virtualenv at your desired location (Here we create it in the folder ~/venvs)

virtualenv ~/venvs/protect

Activate the virtualenv

source ~/venvs/protect/bin/activate

NOTE: Installation was tested using pip 7.1.2 and 8.1.1. We have seen issues with the installation
of pyYAML with lower versions of pip and recommend upgrading pip before installing ProTECT.

pip install --upgrade pip

Install Toil

pip install toil[aws]==3.5.2

Install packaging (required if setuptools>=39.0.1)

pip install packaging

Install ProTECT and all dependencies in the virtualenv

pip install protect

~Method 1 - Using PIP (recommended)~
### Method 2 - Installing from Source

This will install ProTECT in an editable mode.

Obtain the source from Github

git clone https://www.github.com/BD2KGenomics/protect.git
git clone https://www.github.com/Dranion/protect.git

Create and activate a virtualenv in the project folder (Important since the Makefile checks for
this and will fail if it detects that you are not in a virtual environment)

cd protect
virtualenv venv
virtualenv --python=python3 venv
source venv/bin/activate

Install Toil and pytest

make prepare

Install packaging (required if setuptools>=39.0.1)
Install the python3 conversions of bd2k-python-lib and s3am. *The s3am conversion is untested, as I am running locally.*

pip install packaging
make special_install

Install ProTECT

make develop

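As an optional sanity check after `make develop`, the package should import cleanly from inside the virtualenv (a minimal smoke test, assuming the package installs under its usual name `protect`; not part of the official instructions):

    python -c "import protect"
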
## Method 3 - Using Docker
~Method 3 - Using Docker~

Dockerized versions of ProTECT releases can be found at https://quay.io/organization/ucsc_cgl. These
Docker containers run the ProTECT pipeline in single machine mode. The only difference between the
Docker and Python versions of the pipeline is that the Docker container takes the config options,
described below, as command line arguments as opposed to a config file. Running the container
without any arguments will list all the available options. Also, currently the dockerized version of
ProTECT only supports local file export.

# Running ProTECT

@@ -173,7 +141,7 @@ in the pipeline, and the information on the input samples. Elements before a `:` are keys in the
dictionary read into ProTECT and should **NOT** be modified (Barring the patient ID key in the
patients dictionary). Only values to the right of the `:` should be edited.

Every required reference file is provided in the AWS bucket `cgl-pipeline-inputs` under the folder
Every required reference file is provided in the AWS bucket `protect-data` under the folder
`protect/hg19_references` or `protect/hg38_references`. The `README` file in the same location
describes in detail how each file was generated. To use a file located in an s3 bucket, replace
`/path/to` in the following descriptions with `s3://<databucket>/<folder_in_bucket>`.
@@ -547,7 +515,7 @@ purposes:
12: g/f/jobO4yiE4 return self.run(fileStore)
13: g/f/jobO4yiE4 File "/home/ucsc/arjun/tools/dev/toil_clean/src/toil/job.py", line 1406, in run
14: g/f/jobO4yiE4 rValue = userFunction(*((self,) + tuple(self._args)), **self._kwargs)
15: g/f/jobO4yiE4 File "/home/ucsc/arjun/tools/protect_toil_clean/local/lib/python2.7/site-packages/protect/binding_prediction/common.py", line 566, in merge_mhc_peptide_calls
15: g/f/jobO4yiE4 File "/home/ucsc/arjun/tools/protect_toil_clean/local/lib/python3/site-packages/protect/binding_prediction/common.py", line 566, in merge_mhc_peptide_calls
16: g/f/jobO4yiE4 raise RuntimeError('No peptides available for ranking')
17: g/f/jobO4yiE4 RuntimeError: No peptides available for ranking
18: g/f/jobO4yiE4 ERROR:toil.worker:Exiting the worker because of a failed job on host sjcb10st7
@@ -581,9 +549,9 @@ do not store logs from tools (see BD2KGenomics/protect#275). The error looks sim
Z/O/job1uH92D return self.run(fileStore)
Z/O/job1uH92D File "/home/ucsc/arjun/tools/dev/toil_clean/src/toil/job.py", line 1406, in run
Z/O/job1uH92D rValue = userFunction(*((self,) + tuple(self._args)), **self._kwargs)
Z/O/job1uH92D File "/home/ucsc/arjun/tools/protect_toil_clean/local/lib/python2.7/site-packages/protect/mutation_calling/radia.py", line 238, in run_filter_radia
Z/O/job1uH92D File "/home/ucsc/arjun/tools/protect_toil_clean/local/lib/python3/site-packages/protect/mutation_calling/radia.py", line 238, in run_filter_radia
Z/O/job1uH92D tool_version=radia_options['version'])
Z/O/job1uH92D File "/home/ucsc/arjun/tools/protect_toil_clean/local/lib/python2.7/site-packages/protect/common.py", line 138, in docker_call
Z/O/job1uH92D File "/home/ucsc/arjun/tools/protect_toil_clean/local/lib/python3/site-packages/protect/common.py", line 138, in docker_call
Z/O/job1uH92D 'for command \"%s\"' % ' '.join(call),)
Z/O/job1uH92D RuntimeError: docker command returned a non-zero exit status (1)for command "docker run --rm=true -v /scratch/bio/ucsc/toil-681c097c-61da-4687-b734-c5051f0aa19f/tmped2fnu/f041f939-5c0d-40be-a884-68635e929d09:/data --log-driver=none aarjunrao/filterradia:bcda721fc1f9c28d8b9224c2f95c440759cd3a03 TCGA-CH-5788 17 /data/radia.vcf /data /home/radia/scripts -d /data/radia_dbsnp -r /data/radia_retrogenes -p /data/radia_pseudogenes -c /data/radia_cosmic -t /data/radia_gencode --noSnpEff --noBlacklist --noTargets --noRnaBlacklist -f /data/hg38.fa --log=INFO -g /data/radia_filtered_chr17_radia.log"
Z/O/job1uH92D ERROR:toil.worker:Exiting the worker because of a failed job on host sjcb10st1
20 changes: 12 additions & 8 deletions Makefile
100644 → 100755
@@ -45,17 +45,22 @@ help:
@echo "$$help"


python=python2.7
pip=pip2.7
python=python
pip=pip
tests=src/protect/test/unit
extras=

green=\033[0;32m
normal=\033[0m
red=\033[0;31m

# WIP
special_install: check_venv
git clone https://github.com/Dranion/bd2k-extras.git
make -C bd2k-extras/bd2k-python-lib develop
make -C bd2k-extras/s3am develop

prepare: check_venv
@$(pip) install toil==3.8.0 pytest==2.8.3
@$(pip) install toil pytest

develop: check_venv
$(pip) install -e .$(extras)
@@ -107,11 +112,10 @@ clean_pypi:

clean: clean_develop clean_sdist clean_pypi


check_venv:
@$(python) -c 'import sys; sys.exit( int( not hasattr(sys, "real_prefix") ) )' \
|| ( echo "$(red)A virtualenv must be active.$(normal)" ; false )

@$(python) -c 'import sys; sys.exit( int( not (hasattr(sys, "real_prefix") or ( hasattr(sys, "base_prefix") and sys.base_prefix != sys.prefix ) ) ) )' \
|| [ ! -z "${VIRTUAL_ENV}" ] \
|| ( echo "$(red)A virtualenv must be active.$(normal)\n" ; false )

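The rewritten `check_venv` target above accepts both a classic `virtualenv` (which sets `sys.real_prefix`) and python3's built-in `venv` (where `sys.base_prefix` differs from `sys.prefix`), with the `VIRTUAL_ENV` environment variable as a fallback. A standalone sketch of the same detection logic (illustrative, not part of the Makefile):

    # Sketch of the virtualenv/venv detection performed by check_venv.
    import os
    import sys

    def in_virtualenv():
        # classic virtualenv records the original interpreter prefix in sys.real_prefix
        if hasattr(sys, 'real_prefix'):
            return True
        # python3's venv leaves sys.base_prefix pointing at the base interpreter
        if hasattr(sys, 'base_prefix') and sys.base_prefix != sys.prefix:
            return True
        # activation scripts export VIRTUAL_ENV as a last-resort signal
        return bool(os.environ.get('VIRTUAL_ENV'))

    if not in_virtualenv():
        raise SystemExit('A virtualenv must be active.')
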
check_clean_working_copy:
@echo "$(green)Checking if your working copy is clean ...$(normal)"
7 changes: 5 additions & 2 deletions README.md
@@ -1,7 +1,10 @@
[![Stories in Ready](https://badge.waffle.io/BD2KGenomics/protect.png?label=ready&title=Ready)](https://waffle.io/BD2KGenomics/protect)
# ProTECT
### **Pr**ediction **o**f **T**-Cell **E**pitopes for **C**ancer **T**herapy

Adaptation of ProTECT to use python 3.8 instead of 2.7. A complete run has been tested using fastq files from [HCC1395 WGS Exome RNA Seq Data](https://github.com/genome/gms/wiki/HCC1395-WGS-Exome-RNA-Seq-Data), with identical results in both versions of python.

The adaptation was done using 2to3 and manual bug fixing. Manual changes are recorded [in changes.md](changes.md). Since s3am is python2, **ProTECT is currently local-only**; however, an untested python3 version of s3am exists [here](https://github.com/Dranion/bd2k-extras/tree/main). Continuing to the original README:

This repo contains the Python libraries for the Precision Immunology Pipeline developed at UCSC.

src/protect/pipeline/ProTECT.py - The python script for running the pipeline.
@@ -20,6 +23,6 @@ All docker images used in this pipeline are available at


To learn how the pipeline can be run on a sample, head over to the [ProTECT Manual](
https://github.com/BD2KGenomics/protect/blob/master/MANUAL.md)
https://github.com/Dranion/protect/blob/master/MANUAL.md)

ProTECT is currently in its infancy and is under continuous development. We would appreciate users sharing the level 3 data produced by ProTECT with us such that we can better train our predictive models.
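
The README above notes that the port was produced with 2to3 plus manual fixes. For orientation before the `attic/ProTECT.py` diff that follows, here is an illustrative sketch (not copied from the diff) of the typical python2 to python3 changes involved:

    # Illustrative python3 forms of constructs that needed conversion:
    from urllib.parse import urlparse            # was: from urlparse import urlparse

    # range() no longer returns a list, so list concatenation needs an explicit list():
    chromosomes = [''.join(['chr', str(x)]) for x in list(range(1, 23)) + ['X', 'Y']]

    # xrange() is gone; range() is the lazy equivalent:
    for i in range(0, 2):
        pass

    # dict.keys()/.values()/.items() return views in python3, so 2to3 wraps them in
    # list() wherever the code indexes, sorts, or mutates while iterating:
    d = {'a': 1, 'b': 2}
    first_key = sorted(list(d.keys()))[0]
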
52 changes: 26 additions & 26 deletions attic/ProTECT.py
@@ -1,4 +1,4 @@
#!/usr/bin/env python2.7
#!/usr/bin/env python3
# Copyright 2016 Arjun Arkal Rao
#
# Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,7 +20,7 @@
Program info can be found in the docstring of the main function.
Details can also be obtained by running the script with -h .
"""
from __future__ import print_function


import argparse
import errno
Expand All @@ -33,7 +33,7 @@
import time
from collections import defaultdict, Counter
from multiprocessing import cpu_count
from urlparse import urlparse
from urllib.parse import urlparse

from pysam import Samfile

@@ -78,7 +78,7 @@ def parse_config_file(job, config_file):
# along with it's parameters.
for groupname, group_params in tool_specific_param_generator(job, conf):
if groupname == 'patient':
if 'patient_id' not in group_params.keys():
if 'patient_id' not in list(group_params.keys()):
raise ParameterError('A patient group is missing the patient_id flag.')
sample_set[group_params['patient_id']] = group_params
elif groupname == 'Universal_Options':
@@ -104,7 +104,7 @@ def parse_config_file(job, config_file):
raise ParameterError(' The following tools have no arguments in the config file : \n' +
'\n'.join(missing_tools))
# Start a job for each sample in the sample set
for patient_id in sample_set.keys():
for patient_id in list(sample_set.keys()):
job.addFollowOnJobFn(pipeline_launchpad, sample_set[patient_id], univ_options, tool_options)
return None

@@ -248,7 +248,7 @@ def delete_fastqs(job, fastqs):
+- 'normal_dna': [<JSid for 1.fastq> , <JSid for 2.fastq>]
"""
for fq_type in ['tumor_rna', 'tumor_dna', 'normal_dna']:
for i in xrange(0,2):
for i in range(0,2):
job.fileStore.deleteGlobalFile(fastqs[fq_type][i])
return None

@@ -727,7 +727,7 @@ def spawn_radia(job, rna_bam, tumor_bam, normal_bam, univ_options, radia_options
'normal_dnai': normal_bam['normal_dna_fix_pg_sorted.bam.bai']}
# Make a dict object to hold the return values for each of the chromosome jobs. Then run radia
# on each chromosome.
chromosomes = [''.join(['chr', str(x)]) for x in range(1, 23) + ['X', 'Y']]
chromosomes = [''.join(['chr', str(x)]) for x in list(range(1, 23)) + ['X', 'Y']]
perchrom_radia = defaultdict()
for chrom in chromosomes:
perchrom_radia[chrom] = job.addChildJobFn(run_radia, bams, univ_options, radia_options,
@@ -755,11 +755,11 @@ def merge_radia(job, perchrom_rvs):
work_dir = job.fileStore.getLocalTempDir()
# We need to squash the input dict of dicts to a single dict such that it can be passed to
# get_files_from_filestore
input_files = {filename: jsid for perchrom_files in perchrom_rvs.values()
for filename, jsid in perchrom_files.items()}
input_files = {filename: jsid for perchrom_files in list(perchrom_rvs.values())
for filename, jsid in list(perchrom_files.items())}
input_files = get_files_from_filestore(job, input_files, work_dir,
docker=False)
chromosomes = [''.join(['chr', str(x)]) for x in range(1, 23) + ['X', 'Y']]
chromosomes = [''.join(['chr', str(x)]) for x in list(range(1, 23)) + ['X', 'Y']]
with open('/'.join([work_dir, 'radia_calls.vcf']), 'w') as radfile, \
open('/'.join([work_dir, 'radia_filter_passing_calls.vcf']), 'w') as radpassfile:
for chrom in chromosomes:
@@ -961,7 +961,7 @@ def spawn_mutect(job, tumor_bam, normal_bam, univ_options, mutect_options):
job.fileStore.logToMaster('Running spawn_mutect on %s' % univ_options['patient'])
# Make a dict object to hold the return values for each of the chromosome
# jobs. Then run mutect on each chromosome.
chromosomes = [''.join(['chr', str(x)]) for x in range(1, 23) + ['X', 'Y']]
chromosomes = [''.join(['chr', str(x)]) for x in list(range(1, 23)) + ['X', 'Y']]
perchrom_mutect = defaultdict()
for chrom in chromosomes:
perchrom_mutect[chrom] = job.addChildJobFn(run_mutect, tumor_bam, normal_bam, univ_options,
@@ -987,10 +987,10 @@ def merge_mutect(job, perchrom_rvs):
work_dir = job.fileStore.getLocalTempDir()
# We need to squash the input dict of dicts to a single dict such that it can be passed to
# get_files_from_filestore
input_files = {filename: jsid for perchrom_files in perchrom_rvs.values()
for filename, jsid in perchrom_files.items()}
input_files = {filename: jsid for perchrom_files in list(perchrom_rvs.values())
for filename, jsid in list(perchrom_files.items())}
input_files = get_files_from_filestore(job, input_files, work_dir, docker=False)
chromosomes = [''.join(['chr', str(x)]) for x in range(1, 23) + ['X', 'Y']]
chromosomes = [''.join(['chr', str(x)]) for x in list(range(1, 23)) + ['X', 'Y']]
with open('/'.join([work_dir, 'mutect_calls.vcf']), 'w') as mutvcf, \
open('/'.join([work_dir, 'mutect_calls.out']), 'w') as mutout, \
open('/'.join([work_dir, 'mutect_passing_calls.vcf']), 'w') as mutpassvcf:
@@ -1139,7 +1139,7 @@ def run_mutation_aggregator(job, fusion_output, radia_output, mutect_output, ind
input_files.pop('fusion.vcf')
# read files into memory
vcf_file = defaultdict()
mutcallers = input_files.keys()
mutcallers = list(input_files.keys())
with open(''.join([work_dir, '/', univ_options['patient'], '_merged_mutations.vcf']),
'w') as merged_mut_file:
for mut_caller in mutcallers:
@@ -1571,8 +1571,8 @@ def merge_mhc_peptide_calls(job, antigen_predictions, transgened_files):
mhci_files = get_files_from_filestore(job, mhci_preds, work_dir)
# First split mhcii_preds into prediction files and predictors and maintain keys so we can later
# reference them in pairs
mhcii_predictors = {x: y[1] for x, y in mhcii_preds.items()}
mhcii_files = {x: y[0] for x, y in mhcii_preds.items()}
mhcii_predictors = {x: y[1] for x, y in list(mhcii_preds.items())}
mhcii_files = {x: y[0] for x, y in list(mhcii_preds.items())}
mhcii_files = get_files_from_filestore(job, mhcii_files, work_dir)
# Get peptide files
pept_files = get_files_from_filestore(job, pept_files, work_dir)
@@ -1584,7 +1584,7 @@ def merge_mhc_peptide_calls(job, antigen_predictions, transgened_files):
pepmap = json.load(mapfile)
# Incorporate peptide names into the merged calls
with open('/'.join([work_dir, 'mhci_merged_files.list']), 'w') as mhci_resfile:
for mhcifile in mhci_files.values():
for mhcifile in list(mhci_files.values()):
with open(mhcifile, 'r') as mf:
for line in mf:
# Skip header lines
@@ -1605,7 +1605,7 @@ def merge_mhc_peptide_calls(job, antigen_predictions, transgened_files):
# Incorporate peptide names into the merged calls
with open('/'.join([work_dir, 'mhcii_merged_files.list']), 'w') as \
mhcii_resfile:
for mhciifile in mhcii_files.keys():
for mhciifile in list(mhcii_files.keys()):
core_col = None # Variable to hold the column number with the core
if mhcii_predictors[mhciifile] == 'Consensus':
with open(mhcii_files[mhciifile], 'r') as mf:
@@ -1814,7 +1814,7 @@ def prepare_samples(job, fastqs, univ_options):
'normal_dna_fastq_prefix'}
if set(fastqs.keys()).difference(allowed_samples) != {'patient_id'}:
raise ParameterError('Sample with the following parameters has an error:\n' +
'\n'.join(fastqs.values()))
'\n'.join(list(fastqs.values())))
# For each sample type, check if the prefix is an S3 link or a regular file
# Download S3 files.
for sample_type in ['tumor_dna', 'tumor_rna', 'normal_dna']:
@@ -1877,7 +1877,7 @@ def get_files_from_filestore(job, files, work_dir, cache=True, docker=False):
work_dir is the location where the file should be stored
cache indiciates whether caching should be used
"""
for name in files.keys():
for name in list(files.keys()):
outfile = job.fileStore.readGlobalFile(files[name], '/'.join([work_dir, name]), cache=cache)
# If the file pointed to a tarball, extract it to WORK_DIR
if tarfile.is_tarfile(outfile) and file_xext(outfile).startswith('.tar'):
@@ -1924,15 +1924,15 @@ def most_probable_alleles(allele_list):
except KeyError:
all_alleles[allele] = [float(pvalue)]
# If there are less than 2 alleles, report all
if len(all_alleles.keys()) <= 2:
return all_alleles.keys()
if len(list(all_alleles.keys())) <= 2:
return list(all_alleles.keys())
# Else, get the two with most evidence. Evidence is gauged by
# a) How many files (of the 3) thought that Allele was present
# b) In a tie, who has a lower avg p value
# In the lambda function, if 2 alleles have the same number of calls, the sum of the p values is
# a measure of the avg because avg = sum / n and n is equal in both of them.
else:
return sorted(all_alleles.keys(), key=lambda x: \
return sorted(list(all_alleles.keys()), key=lambda x: \
(-len(all_alleles[x]), sum(all_alleles[x])))[0:2]


@@ -2111,7 +2111,7 @@ def print_mhc_peptide(neoepitope_info, peptides, pepmap, outfile):

"""
allele, pept, pred, core = neoepitope_info
peptide_names = [x for x, y in peptides.items() if pept in y]
peptide_names = [x for x, y in list(peptides.items()) if pept in y]
# For each peptide, append the ensembl gene
for peptide_name in peptide_names:
print(allele, pept, peptide_name, core, '0', pred, pepmap[peptide_name], sep='\t',
@@ -2514,7 +2514,7 @@ def strip_xext(filepath):
:return str filepath: Path to the file with the compression extension stripped off.
"""
ext_size = len(file_xext(filepath).split('.')) - 1
for i in xrange(0, ext_size):
for i in range(0, ext_size):
filepath = os.path.splitext(filepath)[0]
return filepath
