Skip to content

Commit

Permalink
Updated Python and PyMC, removed TensorFlow, and added PyTorch in con…
Browse files Browse the repository at this point in the history
…da environment. (#8561)

* Updated Python and PyMC, removed TensorFlow, and added PyTorch in Docker and conda environments. Notable environment changes: python 3.6.10 -> 3.10.13, pymc 3.1 -> 5.10.1, theano 1.0.4 -> pytensor 2.18.3, added pytorch 2.1.0, removed tensorflow 1.15.0 and other CNN dependencies, added libblas-dev to the base Docker.

* Updated gCNV code to account for changes from PyMC3/Theano to PyMC/PyTensor.

* Updated gCNV integration tests.

* Updated gCNV WDL tests.

* Updated other tests and tools affected by environment changes.

* Reverted posterior sampling to online implementation.

* Updated localDevCondaEnv task in build.gradle.

* Addressed review comments and cleaned up TODOs.

* Released gatkbase-3.3.0 and updated Dockerfile.

* Added DeprecatedFeature tags to CNNScoreVariants, CNNVariantTrain, and CNNVariantWriteTensors.
  • Loading branch information
samuelklee authored Jul 9, 2024
1 parent 4af2b49 commit ddaf66f
Show file tree
Hide file tree
Showing 128 changed files with 2,563 additions and 2,616 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/gatk-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
wdlTest: [ 'RUN_CNV_GERMLINE_COHORT_WDL', 'RUN_CNV_GERMLINE_CASE_WDL', 'RUN_CNV_SOMATIC_WDL', 'RUN_M2_WDL', 'RUN_CNN_WDL', 'RUN_VCF_SITE_LEVEL_FILTERING_WDL' ]
wdlTest: [ 'RUN_CNV_GERMLINE_COHORT_WDL', 'RUN_CNV_GERMLINE_CASE_WDL', 'RUN_CNV_SOMATIC_WDL', 'RUN_M2_WDL', 'RUN_VCF_SITE_LEVEL_FILTERING_WDL' ]
continue-on-error: true
name: WDL test ${{ matrix.wdlTest }} on cromwell
steps:
Expand Down
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ARG BASE_DOCKER=broadinstitute/gatk:gatkbase-3.2.0
ARG BASE_DOCKER=broadinstitute/gatk:gatkbase-3.3.0

# stage 1 for constructing the GATK zip
FROM ${BASE_DOCKER} AS gradleBuild
Expand Down Expand Up @@ -85,7 +85,7 @@ ENV CLASSPATH=/gatk/gatk.jar:$CLASSPATH PATH=$CONDA_PATH/envs/gatk/bin:$CONDA_PA

# Start GATK Python environment

RUN conda env create -n gatk -f /gatk/gatkcondaenv.yml && \
RUN conda env create -vv -n gatk -f /gatk/gatkcondaenv.yml && \
echo "source activate gatk" >> /gatk/gatkenv.rc && \
echo "source /gatk/gatk-completion.sh" >> /gatk/gatkenv.rc && \
conda clean -afy && \
Expand Down
2 changes: 1 addition & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -684,7 +684,7 @@ task localDevCondaEnv(type: Exec) {
dependsOn 'condaEnvironmentDefinition'
inputs.file("$buildDir/$pythonPackageArchiveName")
workingDir "$buildDir"
commandLine "conda", "env", "create", "--force", "-f", gatkCondaYML
commandLine "conda", "env", "create", "--yes", "-f", gatkCondaYML
}

task javadocJar(type: Jar, dependsOn: javadoc) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
"CNVGermlineCaseScatteredWorkflow.gcnv_max_training_epochs": 1,
"CNVGermlineCaseScatteredWorkflow.gcnv_min_training_epochs": 1,
"CNVGermlineCaseScatteredWorkflow.gcnv_model_tars": [
"/home/runner/work/gatk/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wes-do-gc-gcnv-model-0.tar.gz",
"/home/runner/work/gatk/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wes-do-gc-gcnv-model-1.tar.gz"],
"/home/runner/work/gatk/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wes-do-gc-gcnv-model-shard-0.tar.gz",
"/home/runner/work/gatk/gatk/src/test/resources/large/cnv_germline_workflows_test_files/wes-do-gc-gcnv-model-shard-1.tar.gz"],
"CNVGermlineCaseScatteredWorkflow.gcnv_num_thermal_advi_iters": 1,
"CNVGermlineCaseScatteredWorkflow.intervals": "/home/runner/work/gatk/gatk/src/test/resources/large/cnv_germline_workflows_test_files/ice_targets_chr20xy.interval_list",
"CNVGermlineCaseScatteredWorkflow.filtered_intervals": "/home/runner/work/gatk/gatk/src/test/resources/large/cnv_germline_workflows_test_files/ice_targets_chr20xy.preprocessed.filtered.interval_list",
Expand Down
1 change: 1 addition & 0 deletions scripts/docker/gatkbase/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ RUN apt update && \
git \
gpg-agent \
build-essential \
libblas-dev \
openjdk-17-jdk \
vim \
software-properties-common && \
Expand Down
66 changes: 27 additions & 39 deletions scripts/gatkcondaenv.yml.template
Original file line number Diff line number Diff line change
Expand Up @@ -15,53 +15,41 @@
name: $condaEnvName
channels:
# if channels other than conda-forge are added and the channel order is changed (note that conda channel_priority is currently set to flexible),
# verify that key dependencies are installed from the correct channel and compiled against MKL
# verify that key dependencies are installed from the correct channel
- conda-forge
- defaults

dependencies:

# core python dependencies
- conda-forge::python=3.6.10 # do not update
- conda-forge::pip=21.3.1
- conda-forge::mkl=2019.5 # MKL typically provides dramatic performance increases for theano, tensorflow, and other key dependencies
- conda-forge::mkl-service=2.3.0
- conda-forge::joblib=1.1.1 # must pin joblib - versions after 1.1.1 no longer support python 3.6
- conda-forge::numpy=1.17.5 # do not update, this will break scipy=1.0.0
# verify that numpy is compiled against MKL (e.g., by checking *_mkl_info using numpy.show_config())
# and that it is used in tensorflow, theano, and other key dependencies
- conda-forge::theano=1.0.4 # it is unlikely that new versions of theano will be released
# verify that this is using numpy compiled against MKL (e.g., by the presence of -lmkl_rt in theano.config.blas.ldflags)
- defaults::tensorflow=1.15.0 # update only if absolutely necessary, as this may cause conflicts with other core dependencies
# verify that this is using numpy compiled against MKL (e.g., by checking tensorflow.pywrap_tensorflow.IsMklEnabled())
- conda-forge::scipy=1.0.0 # do not update, this will break a scipy.misc.logsumexp import (deprecated in scipy=1.0.0) in pymc3=3.1
- conda-forge::pymc3=3.1 # do not update, this will break gcnvkernel
- conda-forge::h5py=2.10.0 # required by keras 2.2.4
- conda-forge::keras=2.2.4 # updated from pip-installed 2.2.0, which caused various conflicts/clobbers of conda-installed packages
# conda-installed 2.2.4 appears to be the most recent version with a consistent API and without conflicts/clobbers
# if you wish to update, note that versions of conda-forge::keras after 2.2.5
# undesirably set the environment variable KERAS_BACKEND = theano by default
- defaults::intel-openmp=2019.4
- conda-forge::scikit-learn=0.23.1
- conda-forge::matplotlib=3.2.1
- conda-forge::pandas=1.0.3
- conda-forge::typing_extensions=4.1.1 # see https://github.com/broadinstitute/gatk/issues/7800 and linked PRs
- conda-forge::dill=0.3.4 # used for pickling lambdas in TrainVariantAnnotationsModel
- conda-forge::python=3.10.13 # do not update without good reason
- conda-forge:pip=23.3.1
- conda-forge:blas=1.0=mkl # our official environment uses MKL versions of various packages; if other versions are desired, users should edit this YML accordingly
- conda-forge::numpy=1.26.2
- conda-forge::pymc=5.10.1
- conda-forge::pytensor=2.18.3
- conda-forge::scipy=1.11.4
- conda-forge::h5py=3.10.0
- conda-forge::pytorch=2.1.0=*mkl*100
- conda-forge::scikit-learn=1.3.2
- conda-forge::matplotlib=3.8.2
- conda-forge::pandas=2.1.3
- conda-forge::tqdm=4.66.1
- conda-forge::dill=0.3.7 # used for pickling lambdas in TrainVariantAnnotationsModel

# core R dependencies; these should only be used for plotting and do not take precedence over core python dependencies!
- r-base=3.6.2
- r-data.table=1.12.8
- r-dplyr=0.8.5
- r-getopt=1.20.3
- r-ggplot2=3.3.0
- r-gplots=3.0.3
- r-gsalib=2.1
- r-optparse=1.6.4
- r-backports=1.1.10
- r-base=4.3.1
- r-data.table=1.14.8
- r-dplyr=1.1.3
- r-getopt=1.20.4
- r-ggplot2=3.4.4
- r-gplots=3.1.3
- r-gsalib=2.2.1
- r-optparse=1.7.3
- r-backports=1.4.1

# other python dependencies; these should be removed after functionality is moved into Java code
- biopython=1.76
- pyvcf=0.6.8
- bioconda::pysam=0.15.3 # using older conda-installed versions may result in libcrypto / openssl bugs
- bioconda::pysam=0.22.0
- conda-forge::pyvcf=0.6.8

# pip installs should be avoided, as pip may not respect the dependencies found by the conda solver
- pip:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,12 +67,12 @@
* <p>OpenMP and MKL parallelism can be controlled by setting the <code>OMP_NUM_THREADS</code> and <code>MKL_NUM_THREADS</code>
* environment variables, respectively.</p>
*
* <p>Advanced users may wish to set the <code>THEANO_FLAGS</code> environment variable to override the GATK theano
* <p>Advanced users may wish to set the <code>PYTENSOR_FLAGS</code> environment variable to override the GATK PyTensor
* configuration. For example, by running
* <code>THEANO_FLAGS="base_compiledir=PATH/TO/BASE_COMPILEDIR" gatk DetermineGermlineContigPloidy ...</code>, users can specify
* the theano compilation directory (which is set to <code>$HOME/.theano</code> by default). See theano documentation
* at <a href="https://theano-pymc.readthedocs.io/en/latest/library/config.html">
* https://theano-pymc.readthedocs.io/en/latest/library/config.html</a>.
* <code>PYTENSOR_FLAGS="base_compiledir=PATH/TO/BASE_COMPILEDIR" gatk DetermineGermlineContigPloidy ...</code>, users can specify
* the PyTensor compilation directory (which is set to <code>$HOME/.pytensor</code> by default). See PyTensor documentation
* at <a href="https://pytensor.readthedocs.io/en/latest/library/config.html">
* https://pytensor.readthedocs.io/en/latest/library/config.html</a>.
* </p>
*
* <h3>Tool run modes</h3>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,12 +92,12 @@
* <p>OpenMP and MKL parallelism can be controlled by setting the <code>OMP_NUM_THREADS</code> and <code>MKL_NUM_THREADS</code>
* environment variables, respectively.</p>
*
* <p>Advanced users may wish to set the <code>THEANO_FLAGS</code> environment variable to override the GATK theano
* <p>Advanced users may wish to set the <code>PYTENSOR_FLAGS</code> environment variable to override the GATK PyTensor
* configuration. For example, by running
* <code>THEANO_FLAGS="base_compiledir=PATH/TO/BASE_COMPILEDIR" gatk GermlineCNVCaller ...</code>, users can specify
* the theano compilation directory (which is set to <code>$HOME/.theano</code> by default). See theano documentation
* at <a href="https://theano-pymc.readthedocs.io/en/latest/library/config.html">
* https://theano-pymc.readthedocs.io/en/latest/library/config.html</a>.
* <code>PYTENSOR_FLAGS="base_compiledir=PATH/TO/BASE_COMPILEDIR" gatk GermlineCNVCaller ...</code>, users can specify
* the PyTensor compilation directory (which is set to <code>$HOME/.pytensor</code> by default). See PyTensor documentation
* at <a href="https://pytensor.readthedocs.io/en/latest/library/config.html">
* https://pytensor.readthedocs.io/en/latest/library/config.html</a>.
* </p>
*
* <h3>Resource usage</h3>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,12 @@
* the python environment is already set up. Otherwise, the environment must be created and activated as described in the
* main GATK README.md file.</p>
*
* <p>Advanced users may wish to set the <code>THEANO_FLAGS</code> environment variable to override the GATK theano
* <p>Advanced users may wish to set the <code>PYTENSOR_FLAGS</code> environment variable to override the GATK PyTensor
* configuration. For example, by running
* <code>THEANO_FLAGS="base_compiledir=PATH/TO/BASE_COMPILEDIR" gatk PostprocessGermlineCNVCalls ...</code>, users can specify
* the theano compilation directory (which is set to <code>$HOME/.theano</code> by default). See theano documentation
* at <a href="https://theano-pymc.readthedocs.io/en/latest/library/config.html">
* https://theano-pymc.readthedocs.io/en/latest/library/config.html</a>.
* <code>PYTENSOR_FLAGS="base_compiledir=PATH/TO/BASE_COMPILEDIR" gatk PostprocessGermlineCNVCalls ...</code>, users can specify
* the PyTensor compilation directory (which is set to <code>$HOME/.pytensor</code> by default). See PyTensor documentation
* at <a href="https://pytensor.readthedocs.io/en/latest/library/config.html">
* https://pytensor.readthedocs.io/en/latest/library/config.html</a>.
* </p>
*
* <h3>Required inputs:</h3>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -204,8 +204,7 @@ private void writeDenoisingPlots(final String sampleName,
//this runs the R statement "source("CNVPlottingLibrary.R")" before the main script runs
executor.addScript(new Resource(PlottingUtils.CNV_PLOTTING_R_LIBRARY, PlotDenoisedCopyRatios.class));
executor.addScript(new Resource(PLOT_DENOISED_COPY_RATIOS_R_SCRIPT, PlotDenoisedCopyRatios.class));
//--args is needed for Rscript to recognize other arguments properly
executor.addArgs("--args",
executor.addArgs(
"--sample_name=" + sampleName,
"--standardized_copy_ratios_file=" + CopyNumberArgumentValidationUtils.getCanonicalPath(inputStandardizedCopyRatiosFile),
"--denoised_copy_ratios_file=" + CopyNumberArgumentValidationUtils.getCanonicalPath(inputDenoisedCopyRatiosFile),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -285,8 +285,7 @@ private void writeModeledSegmentsPlot(final String sampleName,
//this runs the R statement "source("CNVPlottingLibrary.R")" before the main script runs
executor.addScript(new Resource(PlottingUtils.CNV_PLOTTING_R_LIBRARY, PlotModeledSegments.class));
executor.addScript(new Resource(PLOT_MODELED_SEGMENTS_R_SCRIPT, PlotModeledSegments.class));
//--args is needed for Rscript to recognize other arguments properly
executor.addArgs("--args",
executor.addArgs(
"--sample_name=" + sampleName,
"--denoised_copy_ratios_file=" + (inputDenoisedCopyRatiosFile == null ? null : CopyNumberArgumentValidationUtils.getCanonicalPath(inputDenoisedCopyRatiosFile)),
"--allelic_counts_file=" + (inputAllelicCountsFile == null ? null : CopyNumberArgumentValidationUtils.getCanonicalPath(inputAllelicCountsFile)),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@
* -weights path/to/my_model_folder/2dmodel.hd5
* </pre>
*/
@DeprecatedFeature
@DocumentedFeature
@CommandLineProgramProperties(
summary = CNNScoreVariants.USAGE_SUMMARY,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@
oneLineSummary = "Train a CNN model for filtering variants",
programGroup = VariantFilteringProgramGroup.class
)
@DeprecatedFeature
@DocumentedFeature
@ExperimentalFeature
public class CNNVariantTrain extends CommandLineProgram {

@Argument(fullName = "input-tensor-dir", shortName = "input-tensor-dir", doc = "Directory of training tensors to create.")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@
oneLineSummary = "Write variant tensors for training a CNN to filter variants",
programGroup = VariantFilteringProgramGroup.class
)
@DeprecatedFeature
@DocumentedFeature
@ExperimentalFeature
public class CNNVariantWriteTensors extends CommandLineProgram {

@Argument(fullName = StandardArgumentDefinitions.REFERENCE_LONG_NAME,
Expand Down
6 changes: 6 additions & 0 deletions src/main/python/org/broadinstitute/hellbender/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,9 @@ can be installed as a standalone package, a corresponding `setup_<PACKAGE_NAME>.
file may be placed in this directory. However, during creation of the common
GATK conda environment, all packages will be combined and pip-installed as a
single package named ``gatkpythonpackages`` by `setup.py`.

For development, it is easier to install live/editable versions of these packages
(i.e., by running `pip install --editable .` in this directory), so that any code changes are immediately reflected in
the underlying environment. To do this: 1) remove the pip install of the `gatkpythonpackages.zip` archive from the
conda environment file, 2) create and activate the corresponding conda environment, and
3) run the editable pip install.
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ germline copy number variant (gCNV) tools and workflows.
This module implements inference schemes for read-depth profile denoising, germline
integer copy number variation discovery, germline contig ploidy determination, associated
I/O methods, and helper CLI scripts. `gcnvkernel` additionally provides general-purpose
inference schemas built on the top of `PyMC3` and `theano`.
inference schemes built on top of `PyMC` and `pytensor`.

The module is organized as follows::

Expand All @@ -20,7 +20,7 @@ The module is organized as follows::
genomic intervals, read count data, global and sample-specific posteriors, and
sample metadata.

`gcnvkernel.models`: `PyMC3` model declarations, `theano` symbolic operations (e.g.
`gcnvkernel.models`: `PyMC` model declarations, `pytensor` symbolic operations (e.g.
forward-backward algorithm for HMMs), and custom probability distributions.

`gcnvkernel.preprocess`: Routines for filtering interval lists.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from pymc3 import __version__ as pymc3_version
from pymc import __version__ as pymc_version

from ._version import __version__
from .io import io_commons, io_consts, io_ploidy, io_denoising_calling, \
Expand Down Expand Up @@ -26,6 +26,6 @@
from .tasks.inference_task_base import ConvergenceError
from .utils import cli_commons, math

assert pymc3_version == "3.1", "gcnvkernel currently only supports PyMC3 3.1; version found: {0}; " \
"please upgrade or downgrade the PyMC3 module in your python environment " \
"accordingly.".format(pymc3_version)
assert pymc_version == "5.10.1", "gcnvkernel currently only supports PyMC 5.10.1; version found: {0}; " \
"please upgrade or downgrade the PyMC module in your python environment " \
"accordingly.".format(pymc_version)
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '0.8'
__version__ = '0.9'
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# let theano share memory workspace on large tensors with numpy
# let pytensor share memory workspace on large tensors with numpy
borrow_numpy = True

# if a normalized PMF violates total probability by the following threshold, it will
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import numpy as np
from pymc3.variational.callbacks import Callback
from pymc.variational.callbacks import Callback

from ..utils.rls import NonStationaryLinearRegression

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import pymc3 as pm
import pymc as pm
from .. import types

Operator = pm.operators.Operator
Expand All @@ -14,16 +14,15 @@ def __init__(self,
"""Initializer.
Args:
approx: an instance of PyMC3 approximation
temperature: a scalar shared theano tensor variable
approx: an instance of PyMC approximation
temperature: a scalar shared pytensor tensor variable
"""
super().__init__(approx)
assert temperature is not None
self.temperature = temperature

def apply(self, f):
z = self.input
return self.temperature * self.logq_norm(z) - self.logp_norm(z)
return (self.temperature * self.logq_norm - self.logp_norm)[0]


class ADVIDeterministicAnnealing(Inference):
Expand All @@ -32,24 +31,24 @@ class ADVIDeterministicAnnealing(Inference):
Note:
Temperature is not updated automatically by this class. This task is delegated to the ADVI step
function. This can be done by including a temperature update in `more_updates`; refer to
`pymc3.opvi.ObjectiveFunction.step_function` for more information.
`pymc.opvi.ObjectiveFunction.step_function` for more information.
"""
def __init__(self,
local_rv=None,
model=None,
cost_part_grad_scale=1,
scale_cost_to_minibatch=False,
random_seed=None, start=None,
random_seed=None,
start=None,
temperature=None):

assert temperature is not None, "Temperature (a scalar theano shared tensor) is not provided"
assert temperature is not None, "Temperature (a scalar pytensor shared tensor) is not provided"
approx = MeanField(local_rv=local_rv,
model=model,
cost_part_grad_scale=cost_part_grad_scale,
scale_cost_to_minibatch=scale_cost_to_minibatch,
random_seed=random_seed,
start=start)
super().__init__(
KLThermal, MeanField, None,
local_rv=local_rv,
model=model,
cost_part_grad_scale=cost_part_grad_scale,
scale_cost_to_minibatch=scale_cost_to_minibatch,
random_seed=random_seed,
start=start,
op_kwargs={'temperature': temperature})
KLThermal, approx, None, temperature=temperature)
Loading

0 comments on commit ddaf66f

Please sign in to comment.