Merge pull request #161 from klarman-cell-observatory/yiming

Prepare for v1.3.0 release
lilab-bcb · Feb 3, 2021 · 85f51d5 · 85f51d5
2 parents e2923ff + 4cee641
commit 85f51d5
Show file tree

Hide file tree

Showing 8 changed files with 298 additions and 36 deletions.
diff --git a/docker/cumulus/1.3.0/Dockerfile b/docker/cumulus/1.3.0/Dockerfile
@@ -0,0 +1,73 @@
+FROM debian:buster-slim
+# Run every RUN step under bash with pipefail enabled, so that a failure on the
+# left-hand side of a pipeline (for example a failed download piped into
+# another command) fails the build instead of being silently ignored.
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+# OS-level dependencies: compiler toolchain, FFTW headers, a JDK, curl/gnupg and
+# Python 3 with pip.
+# NOTE(review): /usr/share/man/man1 is presumably created up front as a
+# workaround for JDK installation on slim images, which ship without man
+# directories -- confirm before removing.
+# The apt package lists are deleted in this same layer; removing them in a
+# later RUN would not reduce the size of the image.
+RUN mkdir -p /usr/share/man/man1 && \
+    apt-get -qq update && \
+    apt-get -qq -y install --no-install-recommends \
+        build-essential \
+        curl \
+        default-jdk \
+        gnupg \
+        libfftw3-dev \
+        python3 \
+        python3-dev \
+        python3-pip && \
+    rm -rf /var/lib/apt/lists/*
+
+# Install a pinned Google Cloud SDK from Google's apt repository.
+# The signing key is downloaded to a temporary file with `curl -f`, so an HTTP
+# error aborts the build instead of passing an error page (or nothing) on to
+# apt-key. The temporary key file and the apt package lists are removed in the
+# same layer that created them.
+RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \
+    curl -fsSL -o /tmp/cloud.google.apt-key.gpg https://packages.cloud.google.com/apt/doc/apt-key.gpg && \
+    apt-key --keyring /usr/share/keyrings/cloud.google.gpg add /tmp/cloud.google.apt-key.gpg && \
+    rm -f /tmp/cloud.google.apt-key.gpg && \
+    apt-get update -y && \
+    apt-get install -y google-cloud-sdk=326.0.0-0 && \
+    rm -rf /var/lib/apt/lists/*
+
+# Expose python3 under the plain `python` name; the pip steps below invoke `python -m pip`.
+RUN ln -s /usr/bin/python3 /usr/bin/python
+
+# Python dependencies for Pegasus 1.3.0, installed in a single layer.
+# pip itself is upgraded first; every other package is pinned to an exact
+# version and installed one at a time, in the order listed, with
+# --no-cache-dir so pip's download cache is not stored in the image.
+# NOTE(review): the order presumably ensures build prerequisites (setuptools,
+# numpy, Cython, pybind11) are present before packages that compile against
+# them -- confirm before reordering or merging these installs.
+RUN python -m pip install --upgrade pip --no-cache-dir && \
+    python -m pip install setuptools==53.0.0 --no-cache-dir && \
+    python -m pip install numpy==1.19.5 --no-cache-dir && \
+    python -m pip install pandas==1.2.1 --no-cache-dir && \
+    python -m pip install scipy==1.5.4 --no-cache-dir && \
+    python -m pip install Cython==0.29.21 --no-cache-dir && \
+    python -m pip install pybind11==2.6.2 --no-cache-dir && \
+    python -m pip install scikit-image==0.18.1 --no-cache-dir && \
+    python -m pip install scikit-learn==0.24.1 --no-cache-dir && \
+    python -m pip install h5py==3.1.0 --no-cache-dir && \
+    python -m pip install fitsne==1.1.1 --no-cache-dir && \
+    python -m pip install importlib-metadata==3.4.0 --no-cache-dir && \
+    python -m pip install joblib==1.0.0 --no-cache-dir && \
+    python -m pip install psutil==5.8.0 --no-cache-dir && \
+    python -m pip install threadpoolctl==2.1.0 --no-cache-dir && \
+    python -m pip install python-igraph==0.8.3 --no-cache-dir && \
+    python -m pip install leidenalg==0.8.3 --no-cache-dir && \
+    python -m pip install lightgbm==3.1.1 --no-cache-dir && \
+    python -m pip install loompy==3.0.6 --no-cache-dir && \
+    python -m pip install matplotlib==3.3.4 --no-cache-dir && \
+    python -m pip install natsort==7.1.1 --no-cache-dir && \
+    python -m pip install numba==0.52.0 --no-cache-dir && \
+    python -m pip install scanorama==1.7 --no-cache-dir && \
+    python -m pip install scikit-misc==0.1.3 --no-cache-dir && \
+    python -m pip install seaborn==0.11.1 --no-cache-dir && \
+    python -m pip install statsmodels==0.12.2 --no-cache-dir && \
+    python -m pip install numcodecs==0.7.3 --no-cache-dir && \
+    python -m pip install networkx==2.5 --no-cache-dir && \
+    python -m pip install zarr==2.6.1 --no-cache-dir && \
+    python -m pip install anndata==0.7.5 --no-cache-dir && \
+    python -m pip install hnswlib==0.5.0 --no-cache-dir && \
+    python -m pip install louvain==0.7.0 --no-cache-dir && \
+    python -m pip install umap-learn==0.4.6 --no-cache-dir && \
+    python -m pip install torch==1.7.1 --no-cache-dir && \
+    python -m pip install harmony-pytorch==0.1.6 --no-cache-dir && \
+    python -m pip install cirrocumulus==1.1.13.post1 --no-cache-dir && \
+    python -m pip install annoy==1.17.0 --no-cache-dir && \
+    python -m pip install pegasusio==0.2.10 --no-cache-dir && \
+    python -m pip install demuxEM==0.1.5.post1 --no-cache-dir && \
+    python -m pip install forceatlas2-python==1.1 --no-cache-dir && \
+    python -m pip install pegasuspy==1.3.0 --no-cache-dir
+
+# Remove the tools that were only needed to fetch and register the Google apt
+# key (curl, gnupg), drop packages that are no longer required, and delete the
+# apt caches and the dpkg log.
+# NOTE(review): this runs as its own layer, so files deleted here are still
+# stored in the earlier layers that created them; the final filesystem is
+# cleaner, but the image does not get smaller.
+RUN apt-get -qq -y remove curl gnupg && \
+    apt-get -qq -y autoremove && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* /var/log/dpkg.log
+
+# Download monitor_script.sh into /software and make it readable and
+# executable by all users. The directory is created first so that ADD places
+# the file inside it.
+# NOTE(review): the URL tracks the `master` branch and is fetched without a
+# checksum, so the script contents are not pinned to this release.
+RUN mkdir /software
+ADD https://raw.githubusercontent.com/klarman-cell-observatory/cumulus/master/docker/monitor_script.sh /software
+RUN chmod a+rx /software/monitor_script.sh
+
+# Put /software on PATH so monitor_script.sh can be invoked by name.
+ENV PATH=/software:$PATH
diff --git a/docker/pegasus-terra/1.3/Dockerfile b/docker/pegasus-terra/1.3/Dockerfile
@@ -0,0 +1,69 @@
+# Terra Jupyter base image, pinned to a specific tag.
+FROM us.gcr.io/broad-dsp-gcr-public/terra-jupyter-base:0.0.19
+# Switch to root for the system-level installs below.
+USER root
+# Turn off pip's user-install mode for the build steps below, so packages are
+# not installed into the user's directory; it is set back to true near the end
+# of this file for the running notebook.
+ENV PIP_USER=false
+
+# System build dependencies: compiler toolchain, FFTW headers and Python
+# headers. The apt cache and package lists are removed in this same layer.
+RUN apt-get update && \
+    apt-get install --yes --quiet --no-install-recommends \
+        build-essential \
+        libfftw3-dev \
+        python3-dev && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Python dependencies for Pegasus 1.3.0, installed in a single layer.
+# `pip3 -V` prints which pip is in use to the build log; pip is then upgraded
+# and every other package is pinned to an exact version and installed one at a
+# time, in the order listed.
+# --no-cache-dir keeps pip's download/wheel cache out of the image layer,
+# matching the cumulus Dockerfile.
+RUN pip3 -V && \
+    pip3 install --no-cache-dir --upgrade pip && \
+    pip3 install --no-cache-dir setuptools==53.0.0 && \
+    pip3 install --no-cache-dir numpy==1.19.5 && \
+    pip3 install --no-cache-dir pandas==1.2.1 && \
+    pip3 install --no-cache-dir scipy==1.5.4 && \
+    pip3 install --no-cache-dir Cython==0.29.21 && \
+    pip3 install --no-cache-dir pybind11==2.6.1 && \
+    pip3 install --no-cache-dir scikit-image==0.18.1 && \
+    pip3 install --no-cache-dir scikit-learn==0.24.1 && \
+    pip3 install --no-cache-dir h5py==3.1.0 && \
+    pip3 install --no-cache-dir fitsne==1.1.1 && \
+    pip3 install --no-cache-dir importlib-metadata==3.4.0 && \
+    pip3 install --no-cache-dir joblib==1.0.0 && \
+    pip3 install --no-cache-dir psutil==5.8.0 && \
+    pip3 install --no-cache-dir threadpoolctl==2.1.0 && \
+    pip3 install --no-cache-dir python-igraph==0.8.3 && \
+    pip3 install --no-cache-dir leidenalg==0.8.3 && \
+    pip3 install --no-cache-dir lightgbm==3.1.1 && \
+    pip3 install --no-cache-dir loompy==3.0.6 && \
+    pip3 install --no-cache-dir matplotlib==3.3.4 && \
+    pip3 install --no-cache-dir natsort==7.1.1 && \
+    pip3 install --no-cache-dir numba==0.52.0 && \
+    pip3 install --no-cache-dir scanorama==1.7 && \
+    pip3 install --no-cache-dir scikit-misc==0.1.3 && \
+    pip3 install --no-cache-dir seaborn==0.11.1 && \
+    pip3 install --no-cache-dir statsmodels==0.12.2 && \
+    pip3 install --no-cache-dir numcodecs==0.7.3 && \
+    pip3 install --no-cache-dir networkx==2.5 && \
+    pip3 install --no-cache-dir zarr==2.6.1 && \
+    pip3 install --no-cache-dir anndata==0.7.5 && \
+    pip3 install --no-cache-dir hnswlib==0.5.0 && \
+    pip3 install --no-cache-dir louvain==0.7.0 && \
+    pip3 install --no-cache-dir umap-learn==0.4.6 && \
+    pip3 install --no-cache-dir torch==1.7.1 && \
+    pip3 install --no-cache-dir harmony-pytorch==0.1.6 && \
+    pip3 install --no-cache-dir cirrocumulus==1.1.13.post1 && \
+    pip3 install --no-cache-dir annoy==1.17.0 && \
+    pip3 install --no-cache-dir pegasusio==0.2.10 && \
+    pip3 install --no-cache-dir demuxEM==0.1.5.post1 && \
+    pip3 install --no-cache-dir forceatlas2-python==1.1 && \
+    pip3 install --no-cache-dir pegasuspy==1.3.0
+
+# Download the ngrok archive, unpack it in the working directory, move the
+# ngrok binary into /software and delete the archive, all in one layer.
+RUN mkdir -p /software && \
+    ngrok_zip=ngrok-stable-linux-amd64.zip && \
+    wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip && \
+    unzip "$ngrok_zip" && \
+    mv ngrok /software/ && \
+    rm "$ngrok_zip"
+
+# Make the tools placed in /software (ngrok) available on PATH.
+ENV PATH=/software:$PATH
+
+# Switch from root to the jupyter-user account for the running container.
+# ENV uses the key=value form; the space-separated form is deprecated.
+ENV USER=jupyter-user
+USER $USER
+#we want pip to install into the user's dir when the notebook is running
+ENV PIP_USER=true
+
+# Start a Jupyter notebook server when the container runs.
+ENTRYPOINT ["/usr/local/bin/jupyter", "notebook"]
diff --git a/docker/pegasus-terra/1.3/pegasus-terra-1.3-versions.json b/docker/pegasus-terra/1.3/pegasus-terra-1.3-versions.json
@@ -0,0 +1,68 @@
+{
+    "python": {
+        "adjustText": "0.7.3",
+        "anndata": "0.7.5",
+        "annoy": "1.17.0",
+        "cirrocumulus": "1.1.13.post1",
+        "Cython": "0.29.21",
+        "demuxEM": "0.1.5.post1",
+        "firecloud": "0.16.25",
+        "fitsne": "1.1.1",
+        "forceatlas2-python": "1.1",
+        "gprofiler-official": "1.0.0",
+        "h5py": "3.1.0",
+        "harmony-pytorch": "0.1.6",
+        "hnswlib": "0.5.0",
+        "importlib-metadata": "3.4.0",
+        "joblib": "1.0.0",
+        "jupyter": "1.0.0",
+        "jupyter-client": "6.1.11",
+        "jupyter-console": "6.2.0",
+        "jupyter-contrib-core": "0.3.3",
+        "jupyter-contrib-nbextensions": "0.5.1",
+        "jupyter-core": "4.7.0",
+        "jupyter-highlight-selected-word": "0.2.0",
+        "jupyter-latex-envs": "1.4.6",
+        "jupyter-nbextensions-configurator": "0.4.1",
+        "jupyterlab": "0.35.4",
+        "jupyterlab-pygments": "0.1.2",
+        "jupyterlab-server": "0.2.0",
+        "leidenalg": "0.8.3",
+        "lightgbm": "3.1.1",
+        "loompy": "3.0.6",
+        "louvain": "0.7.0",
+        "matplotlib": "3.3.4",
+        "natsort": "7.1.1",
+        "nbclient": "0.5.1",
+        "nbconvert": "6.0.7",
+        "nbformat": "5.1.2",
+        "networkx": "2.5",
+        "notebook": "6.1.1",
+        "numba": "0.52.0",
+        "numpy": "1.19.5",
+        "pandas": "1.2.1",
+        "pegasusio": "0.2.10",
+        "pegasuspy": "1.3.0",
+        "Pillow": "8.1.0",
+        "pip": "21.0.1",
+        "pyarrow": "3.0.0",
+        "pybind11": "2.6.1",
+        "python-igraph": "0.8.3",
+        "scanorama": "1.7",
+        "scikit-image": "0.18.1",
+        "scikit-learn": "0.24.1",
+        "scikit-misc": "0.1.3",
+        "scipy": "1.5.4",
+        "seaborn": "0.11.1",
+        "setuptools": "53.0.0",
+        "statsmodels": "0.12.2",
+        "torch": "1.7.1",
+        "umap-learn": "0.4.6",
+        "XlsxWriter": "1.3.7",
+        "zarr": "2.6.1"
+    },
+    "tools": {
+        "google-cloud-sdk": "324.0.0",
+        "ngrok": "2.3.35"
+    }
+}
diff --git a/docs/cumulus.rst b/docs/cumulus.rst
@@ -121,11 +121,10 @@ Cumulus steps:
 #. **plot**. This step is optional. In this step, **Cumulus** can generate 5 types of figures based on the **cluster** step results:
 
     - **composition** plots which are bar plots showing the cell compositions (from different conditions) for each cluster. This type of plot is useful for quickly assessing library quality and batch effects.
-    - **tsne**, **fitsne**, and **net_tsne**: t-SNE like plots based on different algorithms, respectively. Users can specify cell attributes (e.g. cluster labels, conditions) for coloring side-by-side.
     - **umap** and **net_umap**: UMAP like plots based on different algorithms, respectively. Users can specify cell attributes (e.g. cluster labels, conditions) for coloring side-by-side.
+    - **tsne**: FIt-SNE plots. Users can specify cell attributes (e.g. cluster labels, conditions) for coloring side-by-side.
     - **fle** and **net_fle**: FLE (Force-directed Layout Embedding) like plots based on different algorithms, respectively. Users can specify cell attributes (e.g. cluster labels, conditions) for coloring side-by-side.
-    - **diffmap** plots which are 3D interactive plots showing the diffusion maps. The 3 coordinates are the first 3 PCs of all diffusion components.
-    - If input is CITE-Seq data, there will be **citeseq_fitsne** plots which are FIt-SNE plots based on epitope expression.
+    - If input is CITE-Seq data, there will be **citeseq_umap** plots which are UMAP plots based on epitope expression.
 
 #. **cirro_output**. This step is optional. Generate `Cirrocumulus`_ inputs for visualization using `Cirrocumulus`_ .
 
@@ -160,10 +159,10 @@ global inputs
       - This is the name of subdirectory for the current sample; and all output files within the subdirectory will have this string as the common filename prefix.
       - "my_sample"
       -
-    * - cumulus_version
-      - cumulus version to use. Versions available: 1.1.0, 1.0.0, 0.16.0, 0.15.0, 0.13.0, 0.12.0, 0.11.0, 0.10.0.
-      - "1.1.0"
-      - "1.1.0"
+    * - pegasus_version
+      - Pegasus version to use for analysis. Versions available: ``1.3.0``.
+      - "1.3.0"
+      - "1.3.0"
     * - docker_registry
       - Docker registry to use. Options:
 
@@ -599,6 +598,7 @@ cluster outputs
         | To load this file in Python, you need to first install `PegasusIO`_ on your local machine. Then use ``import pegasusio as io; data = io.read_input('output_name.zarr.zip')`` in Python environment.
         | ``data`` is a *MultimodalData* object, and points to its default *UnimodalData* element. You can set its default *UnimodalData* to others by ``data.set_data(focus_key)`` where ``focus_key`` is the key string to the wanted *UnimodalData* element.
         | For its default *UnimodalData* element, the log-normalized expression matrix is stored in ``data.X`` as a Scipy CSR-format sparse matrix, with cell-by-gene shape.
+        | Alternatively, to get the raw count matrix, first run ``data.select_matrix('raw.X')``, then ``data.X`` will be switched to point to the raw matrix.
         | The ``obs`` field contains cell related attributes, including clustering results.
         | For example, ``data.obs_names`` records cell barcodes; ``data.obs['Channel']`` records the channel each cell comes from;
         | ``data.obs['n_genes']``, ``data.obs['n_counts']``, and ``data.obs['percent_mito']`` record the number of expressed genes, total UMI count, and mitochondrial rate for each cell respectively;
@@ -608,8 +608,7 @@ cluster outputs
         | The ``obsm`` field records embedding coordinates.
         | For example, ``data.obsm['X_pca']`` records PCA coordinates, ``data.obsm['X_tsne']`` records t-SNE coordinates,
         | ``data.obsm['X_umap']`` records UMAP coordinates, ``data.obsm['X_diffmap']`` records diffusion map coordinates,
-        | ``data.obsm['X_diffmap_pca']`` records the first 3 PCs by projecting the diffusion components using PCA,
-        | and ``data.obsm['X_fle']`` records the force-directed layout coordinates from the diffusion components.
+        | and ``data.obsm['X_fle']`` records the force-directed layout coordinates.
         | The ``uns`` field stores other related information, such as reference genome (``data.uns['genome']``), kNN on PCA coordinates (``data.uns['pca_knn_indices']`` and ``data.uns['pca_knn_distances']``), etc.
     * - **output_log**
       - File
@@ -619,8 +618,7 @@ cluster outputs
       - | List of output file(s) in Seurat-compatible h5ad format (output_name.focus_key.h5ad), in which each file is associated with a focus of the input data.
         | To load this file in Python, first install `PegasusIO`_ on your local machine. Then use ``import pegasusio as io; data = io.read_input('output_name.focus_key.h5ad')`` in Python environment.
         | After loading, ``data`` has a structure similar to the *UnimodalData* object in the Description of **output_zarr** in the `cluster outputs <./cumulus.html#cluster-outputs>`_ section.
-        | In addition, ``data.raw.X`` records filtered raw count matrix as a Scipy CSR-format sparse matrix, with cell-by-gene shape.
-        | ``data.uns['scale.data']`` records variable-gene-selected and standardized expression matrix which are ready to perform PCA, and ``data.uns['scale.data.rownames']`` records indexes of the selected highly variable genes.
+        | In addition, ``data.uns['scale.data']`` records the variable-gene-selected and standardized expression matrix, which is ready for PCA, and ``data.uns['scale.data.rownames']`` records the indexes of the selected highly variable genes.
         | This file is used for loading in R and converting into a Seurat object (see `here <./cumulus.html#load-h5ad-file-into-seurat>`_ for instructions)
     * - output_filt_xlsx
       - File
@@ -660,8 +658,7 @@ cluster outputs
         | ``ds.ca['louvain_labels']``, ``ds.ca['leiden_labels']``, ``ds.ca['spectral_louvain_labels']``, and ``ds.ca['spectral_leiden_labels']`` record each cell's cluster labels using different clustering algorithms;
         | ``ds.ca['X_pca']`` records PCA coordinates, ``ds.ca['X_tsne']`` records t-SNE coordinates,
         | ``ds.ca['X_umap']`` records UMAP coordinates, ``ds.ca['X_diffmap']`` records diffusion map coordinates,
-        | ``ds.ca['X_diffmap_pca']`` records the first 3 PCs by projecting the diffusion components using PCA,
-        | and ``ds.ca['X_fle']`` records the force-directed layout coordinates from the diffusion components.
+        | and ``ds.ca['X_fle']`` records the force-directed layout coordinates.
         | The ``ra`` field contains gene related attributes as column attributes.
         | For example, ``ds.ra['var_names']`` records gene symbols, ``ds.ra['gene_ids']`` records Ensembl gene IDs, and ``ds.ra['highly_variable_features']`` records selected variable genes.
 
@@ -748,8 +745,8 @@ de_analysis outputs
       - | List of h5ad-formatted results with DE results updated (output_name.focus_key.h5ad), in which each file is associated with a focus of the input data.
         | To load this file in Python, you need to first install `PegasusIO`_ on your local machine. Then type ``import pegasusio as io; data = io.read_input('output_name.focus_key.h5ad')`` in Python environment.
         | After loading, ``data`` has a structure similar to the *UnimodalData* object in the Description of **output_zarr** in the `cluster outputs <./cumulus.html#cluster-outputs>`_ section.
-        | Besides, there is one additional field ``varm`` which records DE analysis results in ``data.varm['de_res']``. You can use Pandas DataFrame to convert it into a reader-friendly structure: ``import pandas as pd; df = pd.DataFrame(data.varm['de_res'], index = data.var_names)``. Then in the resulting data frame, genes are rows, and those DE test statistics are columns.
-        | DE analysis in cumulus is performed on each cluster against cells in all the other clusters. For instance, in the data frame, column ``mean_logExpr:1`` refers to the mean expression of genes in log-scale for cells in Cluster 1. The number after colon refers to the cluster label to which this statistic belongs.
+        | Besides, there is one additional field ``varm`` which records DE analysis results in ``data.varm['de_res']``. You can use Pandas DataFrame to convert it into a reader-friendly structure: ``import pandas as pd; df = pd.DataFrame(data.varm['de_res'], index=data.var_names)``. Then in the resulting data frame, genes are rows, and those DE test statistics are columns.
+        | DE analysis in cumulus is performed on each cluster against cells in all the other clusters. For instance, in the data frame, column ``1:log2Mean`` refers to the mean expression of genes in log-scale for cells in Cluster 1. The number before the colon refers to the cluster label to which this statistic belongs.
     * - output_de_xlsx
       - Array[File]
       - | List of spreadsheets reporting DE results (output_name.focus_key.de.xlsx), in which each file is associated with a focus of the input data.

diff --git a/docs/release_notes.rst b/docs/release_notes.rst
@@ -1,3 +1,10 @@
+Version 1.3.0 `February 2, 2021`
+--------------------------------
+
+* On *cumulus* workflow:
+    * Change ``cumulus_version`` to ``pegasus_version`` to avoid confusion.
+    * Update to use Pegasus v1.3.0 for analysis.
+
 Version 1.2.0 `January 19, 2021`
 --------------------------------