fix scripts

openproblems-bio · Aug 29, 2024 · 163f821
1 parent 497ed55
commit 163f821
Show file tree

Hide file tree

Showing 5 changed files with 25 additions and 23 deletions.
diff --git a/scripts/create_test_resources.sh b/scripts/create_test_resources.sh
@@ -77,26 +77,27 @@ viash run src/data_processors/crop_region/config.vsh.yaml -- \
   --min_y 10000 \
   --max_y 12000
 
+aws s3 sync --profile op \
+  "resources_test/common/2023_10x_mouse_brain_xenium" \
+  "s3://openproblems-data/resources_test/common/2023_10x_mouse_brain_xenium" \
+  --delete --dryrun
 
 ###################################################################################
-DATASET_ID="allen_brain_cell_atlas/2023_Yao_mouse_brain_scRNAseq_10Xv2"
+DATASET_ID="allen_brain_cell_atlas/2023_yao_mouse_brain_scrnaseq_10xv2"
 TMP_DIR="temp/datasets/$DATASET_ID"
-OUT_DIR="resources_test/common/2023_abca_Yao_mouse_brain_scRNAseq_10Xv2"
-
+OUT_DIR="resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2"
 
 # generate sc reference
 VIASH_TEMP=/tmp/allen_brain_cell_atlas \
-  viash run src/data_loaders/download_allen_brain_cell_atlas/config.vsh.yaml -- \
-  --output "$TMP_DIR/tmp_sc_reference.h5ad" --regions "OLF;TH"
+  viash run src/data_loaders/download_allen_brain_cell_atlas/config.vsh.yaml \
+  --keep true -- \
+  --output "$TMP_DIR/tmp_dataset.h5ad" --regions "OLF;TH"
 
 viash run src/data_processors/subset_reference/config.vsh.yaml -- \
-  --input "$TMP_DIR/tmp_sc_reference.h5ad" \
-  --output "$OUT_DIR/sc_reference.h5ad"
+  --input "$TMP_DIR/tmp_dataset.h5ad" \
+  --output "$OUT_DIR/dataset.h5ad"
 
-
-
-###################################################################################
 aws s3 sync --profile op \
-  "resources_test/common/2023_10x_mouse_brain_xenium" \
-  "s3://openproblems-data/resources_test/common/2023_10x_mouse_brain_xenium" \
+  "resources_test/common/2023_yao_mouse_brain_scrnaseq_10xv2" \
+  "s3://openproblems-data/resources_test/common/2023_abca_Yao_mouse_brain_scRNAseq_10Xv2" \
   --delete --dryrun
diff --git a/src/data_loaders/download_allen_brain_cell_atlas/config.vsh.yaml b/src/data_loaders/download_allen_brain_cell_atlas/config.vsh.yaml
@@ -6,7 +6,7 @@ argument_groups:
   - name: Inputs
     arguments:
       - type: string
-        name: --version
+        name: --abca_version
         required: false
         default: "20230630"
         description: The version of the Allen Brain Cell Atlas to download data from.

diff --git a/src/data_loaders/download_allen_brain_cell_atlas/script.py b/src/data_loaders/download_allen_brain_cell_atlas/script.py
@@ -7,17 +7,17 @@
 
 ## VIASH START
 par = {
-    "version": "20230630",
+    "abca_version": "20230630",
     "regions": ["OLF", "TH"],
-    "output": "temp/datasets/allen_brain_cell_atlas/2023_Yao_mouse_brain_scRNAseq_10Xv2/dataset.h5ad",
+    "output": "tmp_dataset.h5ad",
 }
 meta = {
     "temp_dir": "/tmp/allen_brain_cell_atlas",
 }
 ## VIASH END
 
 # helper variables
-VERSION = par["version"]
+VERSION = par["abca_version"]
 REGIONS = par["regions"]
 TMP_DIR = Path(meta["temp_dir"] or "/tmp")
 
@@ -116,8 +116,8 @@
 adata.var = adata.var.rename(columns={"gene_symbol":"feature_name"})
 
 # Uns
-adata.uns["dataset_id"] = "2023_Yao_mouse_brain_scRNAseq_10Xv2"
-adata.uns["dataset_name"] = "2023_Yao_mouse_brain_scRNAseq_10Xv2"
+adata.uns["dataset_id"] = "allen_brain_cell_atlas/2023_yao_mouse_brain_scrnaseq_10xv2"
+adata.uns["dataset_name"] = "ABCA Mouse Brain scRNAseq"
 adata.uns["dataset_url"] = "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE246717"
 adata.uns["dataset_reference"] = "10.1038/s41586-023-06812-z"
 adata.uns["dataset_summary"] = "A high-resolution scRNAseq atlas of cell types in the whole mouse brain"

diff --git a/src/data_processors/subset_reference/config.vsh.yaml b/src/data_processors/subset_reference/config.vsh.yaml
@@ -13,7 +13,7 @@ arguments:
     direction: output
   - type: integer
     name: --n_cells
-    required: true
+    required: false
     description: The number of cells to sample.
     default: 500
   - type: integer
@@ -23,7 +23,7 @@ arguments:
     default: 50
   - type: string
     name: --cell_type_key
-    required: true
+    required: false
     description: The key in the obs dataframe that contains the cell type information.
     default: cell_type
   - name: "--keep_cell_type_categories"

diff --git a/src/data_processors/subset_reference/script.py b/src/data_processors/subset_reference/script.py
@@ -4,8 +4,8 @@
 
 ### VIASH START
 par = {
-    "input": "path/to/input.h5ad",
-    "output": "path/to/output.h5ad",
+    "input": "temp/datasets/allen_brain_cell_atlas/2023_Yao_mouse_brain_scRNAseq_10Xv2/tmp_dataset.h5ad",
+    "output": "resources_test/common/2023_abca_Yao_mouse_brain_scRNAseq_10Xv2/dataset.h5ad",
     "n_cells": 500,
     "min_n_cells_per_cell_type": 50,
     "cell_type_key": "cell_type",
@@ -45,7 +45,8 @@
     cell_type_indices = adata.obs_names[adata.obs[par["cell_type_key"]] == ct]
     n_cells_in_type = len(cell_type_indices)
     n_per_cell_type[ct] = min(n_cells_in_type, par["min_n_cells_per_cell_type"])
-
+    selected_indices.extend(np.random.choice(cell_type_indices, n_per_cell_type[ct], replace=False))
+
 # Cap the number of cells to sample per cell type by the total number of cells to sample
 #TODO: instead of the random choice below, adjust n_per_cell_type so that sum(n_per_cell_type.values()) <= par["n_cells"]