diff --git a/modules/merge-sce/main.nf b/modules/merge-sce/main.nf index 048c781..4d33444 100644 --- a/modules/merge-sce/main.nf +++ b/modules/merge-sce/main.nf @@ -98,9 +98,9 @@ process export_anndata { --is_merged \ # move normalized counts to X in AnnData - move_counts_anndata.py --anndata_file ${rna_h5ad_file} + reformat_anndata.py --anndata_file ${rna_h5ad_file} --hvg_name "merged_highly_variable_genes" if [ -f ${feature_h5ad_file} ]; then - move_counts_anndata.py --anndata_file ${feature_h5ad_file} + reformat_anndata.py --anndata_file ${feature_h5ad_file} --hvg_name "none" fi """ stub: diff --git a/modules/merge-sce/readme.md b/modules/merge-sce/readme.md index 8b05cb6..df92e98 100644 --- a/modules/merge-sce/readme.md +++ b/modules/merge-sce/readme.md @@ -3,12 +3,11 @@ This workflow creates merged SCE files and associated AnnData files from a project's individual SCE files. The workflow and scripts are largely ported from the [`scpca-nf` workflow](https://github.com/AlexsLemonade/scpca-nf). -The [a9dc826 commit](https://github.com/AlexsLemonade/scpca-nf/tree/a9dc826b8576c48ca38c6ca137f9eeced29c3acc) was used as the base for the port. +The [519f3bd commit](https://github.com/AlexsLemonade/scpca-nf/commit/519f3bdb5f9d2d33bb6829dddd537b67052cc29d) was used as the base for the port. There have been some small changes, in particular: -- Containers have been updated to use a more recent version of `scpcaTools`, with underlying updates to Bioconductor and Python packages. - The workflow is run on a project level, and all SCE files within a project are merged. - the assumption in this workflow is that all libraries within a project are to be merged, so there (currently) are no options to specify which specific samples to merge. - - The `--include_adt` flag was also removed from the `merge_sces.R` script, replaced by looking directly at the SCE objects - - The `sce_to_anndata.R` script now only warns if the requested feature altExp is not found in the SCE file, rather than erroring out. +- The `--include_adt` flag was also removed from the `merge_sces.R` script, replaced by looking directly at the SCE objects +- The `sce_to_anndata.R` script now only warns if the requested feature altExp is not found in the SCE file, rather than erroring out. diff --git a/modules/merge-sce/resources/usr/bin/move_counts_anndata.py b/modules/merge-sce/resources/usr/bin/move_counts_anndata.py deleted file mode 100755 index 75e0412..0000000 --- a/modules/merge-sce/resources/usr/bin/move_counts_anndata.py +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 - -# This script takes an AnnData object and checks for the `logcounts` -# in layers. If present, `logcounts` is moved to `X` and `X` (which has the raw counts) -# is moved to `raw.X` - -import argparse -import os -import re - -import anndata as adata - -parser = argparse.ArgumentParser() -parser.add_argument( - "-i", - "--anndata_file", - dest="anndata_file", - required=True, - help="Path to HDF5 file with processed AnnData object", -) -parser.add_argument( - "-u", - "--uncompressed", - dest="compress", - action="store_false", - help="Output an uncompressed HDF5 file", -) - -args = parser.parse_args() - -# compile extension regex -file_ext = re.compile(r"\.hdf5$|\.h5$|\.h5ad$", re.IGNORECASE) - -# check that input file exists, if it does exist, make sure it's an h5 file -if not os.path.exists(args.anndata_file): - raise FileExistsError("`input_anndata` does not exist.") -elif not file_ext.search(args.anndata_file): - raise ValueError( - "--input_anndata must end in either .hdf5, .h5, or .h5ad, and contain a processed AnnData object." - ) - -# read in anndata -object = adata.read_h5ad(args.anndata_file) - -# if logcounts is present -if "logcounts" in object.layers: - # move X to raw.X by creating the raw object - object.raw = object - # move logcounts to X and rename - object.X = object.layers["logcounts"] - object.uns["X_name"] = "logcounts" - del object.layers["logcounts"] - - # export object - object.write_h5ad(args.anndata_file, compression="gzip" if args.compress else None) diff --git a/modules/merge-sce/resources/usr/bin/reformat_anndata.py b/modules/merge-sce/resources/usr/bin/reformat_anndata.py new file mode 100755 index 0000000..f64c113 --- /dev/null +++ b/modules/merge-sce/resources/usr/bin/reformat_anndata.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 +""" +This script takes an AnnData object and checks for the `logcounts` in layers. +If present, `logcounts` is moved to `X` and `X` (which has the raw counts) is moved to `raw.X` + +In addition, any DataFrames in `obsm` are converted to ndarrays, highly variable genes are converted to a `var` column. +If a `pca_meta_file` is provided, PCA variance statistics and standard creation values are in the format expected by scanpy. +""" + +import argparse +import os +import re + +import anndata +import pandas as pd + +parser = argparse.ArgumentParser() +parser.add_argument( + "-i", + "--anndata_file", + dest="anndata_file", + required=True, + help="Path to HDF5 file with processed AnnData object", +) +parser.add_argument( + "--pca_meta_file", + dest="pca_meta_file", + required=False, + help="Path to file with a table of variance explained by each PCA component", +) +parser.add_argument( + "--pca_not_centered", + dest="pca_centered", + action="store_false", + help="Indicate that the PCA table is not zero-centered", +) +parser.add_argument( + "--hvg_name", + dest="hvg_name", + default="highly_variable_genes", + help=( + "Indicate the name used to store highly variable genes associated with the PCA in the exported AnnData object." + "Use the value 'none' if no highly variable genes were used." + ), +) +parser.add_argument( + "-u", + "--uncompressed", + dest="compress", + action="store_false", + help="Output an uncompressed HDF5 file", +) + +args = parser.parse_args() + +# compile extension regex +file_ext = re.compile(r"\.(hdf5|h5|h5ad)$", re.IGNORECASE) + +# check that input file exists, if it does exist, make sure it's an h5 file +if not os.path.exists(args.anndata_file): + raise FileExistsError("`input_anndata` does not exist.") +elif not file_ext.search(args.anndata_file): + raise ValueError( + "input_anndata must end in either .hdf5, .h5, or .h5ad, and contain a processed AnnData adata." + ) + +# if there is a pca_meta_file, check that it exists and has a tsv extension +if args.pca_meta_file: + if not os.path.exists(args.pca_meta_file): + raise FileExistsError("`pca_meta_file` does not exist.") + elif not args.pca_meta_file.casefold().endswith(".tsv"): + raise ValueError("`pca_meta_file` must end in .tsv.") + +# read in anndata +adata = anndata.read_h5ad(args.anndata_file) + +# if logcounts is present +if "logcounts" in adata.layers: + # move X to raw.X by creating the raw adata + adata.raw = adata + # move logcounts to X and rename + adata.X = adata.layers["logcounts"] + adata.uns["X_name"] = "logcounts" + del adata.layers["logcounts"] + +# convert DataFrames in obsm to ndarrays +for key, value in adata.obsm.items(): + if isinstance(value, pd.DataFrame): + adata.obsm[key] = value.to_numpy() + +# convert highly variable genes to a column if given +use_hvg = args.hvg_name.casefold() != "none" +if use_hvg: + if args.hvg_name not in adata.uns.keys(): + raise ValueError("`hvg_name` must be present in the `uns` data for the object") + adata.var["highly_variable"] = adata.var.gene_ids.isin(adata.uns[args.hvg_name]) + + +# add pca adata to uns if pca_meta_file is provided in the format created by scanpy +if args.pca_meta_file: + pca_meta = pd.read_csv(args.pca_meta_file, sep="\t", index_col=0) + if "variance" not in pca_meta.columns or "variance_ratio" not in pca_meta.columns: + raise ValueError( + "`pca_meta_file` must contain columns `variance` and `variance_ratio`" + ) + pca_object = { + "param": { + "zero_center": args.pca_centered, + "use_highly_variable": use_hvg, + "mask_var": ("highly_variable" if use_hvg else None), + }, + "variance": pca_meta["variance"].to_numpy(), + "variance_ratio": pca_meta["variance_ratio"].to_numpy(), + } + adata.uns["pca"] = pca_object + +# export adata +adata.write_h5ad(args.anndata_file, compression="gzip" if args.compress else None) diff --git a/modules/merge-sce/resources/usr/bin/sce_to_anndata.R b/modules/merge-sce/resources/usr/bin/sce_to_anndata.R index a3f672e..de2b988 100755 --- a/modules/merge-sce/resources/usr/bin/sce_to_anndata.R +++ b/modules/merge-sce/resources/usr/bin/sce_to_anndata.R @@ -23,6 +23,12 @@ option_list <- list( type = "character", help = "path to output hdf5 file to store RNA counts as AnnData object. Must end in .hdf5, .h5ad, or .h5" ), + make_option( + opt_str = c("--output_pca_tsv"), + default = NULL, + type = "character", + help = "path to output a table of variance explained by each principal component. Must end in .tsv" + ), make_option( opt_str = c("--feature_name"), type = "character", @@ -62,12 +68,17 @@ if (!(stringr::str_ends(opt$output_rna_h5, ".hdf5|.h5|.h5ad"))) { stop("output rna file name must end in .hdf5, .h5, or .h5ad") } +# check that the pca file is a tsv +if (!is.null(opt$output_pca_tsv) && !stringr::str_ends(opt$output_pca_tsv, ".tsv")) { + stop("output pca file name must end in .tsv") +} + # Merged object function ------------------------------------------------------ # this function updates merged object formatting for anndata export format_merged_sce <- function(sce) { # paste X to any present reduced dim names - reducedDimNames(sce) <- glue::glue("X_{reducedDimNames(sce)}") + reducedDimNames(sce) <- glue::glue("X_{tolower(reducedDimNames(sce))}") return(sce) } @@ -120,8 +131,8 @@ format_czi <- function(sce) { # so everything gets set to false rowData(sce)$feature_is_filtered <- FALSE - # paste X to any present reduced dim names - reducedDimNames(sce) <- glue::glue("X_{reducedDimNames(sce)}") + # paste X to any present reduced dim names, converting to lower case + reducedDimNames(sce) <- glue::glue("X_{tolower(reducedDimNames(sce))}") return(sce) } @@ -161,6 +172,18 @@ scpcaTools::sce_to_anndata( compression = ifelse(opt$compress_output, "gzip", "none") ) +# Get PCA metadata for AnnData +if (!is.null(opt$output_pca_tsv) && "X_pca" %in% reducedDimNames(sce)) { + pca_meta_df <- data.frame( + PC = 1:ncol(reducedDims(sce)$X_pca), + variance = attr(reducedDims(sce)$X_pca, "varExplained"), + variance_ratio = attr(reducedDims(sce)$X_pca, "percentVar") / 100 + ) + + # write pca to tsv + readr::write_tsv(pca_meta_df, opt$output_pca_tsv) +} + message("Exported RNA") # AltExp to AnnData -----------------------------------------------------------