From 66d1f9f877cca7efe7304bf6609bd4f9f32cabdf Mon Sep 17 00:00:00 2001
From: atarashansky <alexander.tarashansky@czbiohub.org>
Date: Fri, 23 Jul 2021 10:28:30 -0700
Subject: [PATCH 1/6] Added intelligent subsampling module

---
 scrna/subsample.py | 67 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100644 scrna/subsample.py

diff --git a/scrna/subsample.py b/scrna/subsample.py
new file mode 100644
index 00000000..7c5dd19c
--- /dev/null
+++ b/scrna/subsample.py
@@ -0,0 +1,67 @@
+import numpy as np
+
+def q(x):
+    return npaarray(list(x))
+
+def subsample(adata,clname1,clname2=None,nc=10000,MIN=15):
+    """
+    This function goes through each cell type label in `clname1` and then makes sure to select a proportional number
+    of each type of cell from `clname2` within that cluster.
+    
+    So, for example, let's say clname1 = tissue annotations, and clname2 = subtype annotations. 
+    This function would go tissue by tissue, and sample a proportional number of cells from each present subtype
+    annotation present within that tissue. This ensures all cell types are included when downsampling.
+    
+    If `clname1=clname2` then it just samples a proportional number of cells from each cluster.
+        
+    Parameters
+    -----------
+    adata : AnnData object
+    
+    clname1 : str
+        Annotation key 1
+
+    clname2 : str, optional, default None
+        Annotation key 2. If None, `clname2` is set equal to `clname1`
+        
+    nc : int, optional, default 10000
+        Number of cells to target for downsampling. The final number of cells may be slightly lower.
+        Tune `nc` accordingly if you'd like to hit an exact target.
+        
+    MIN : int, optional, default 15
+        Minimum cluster size after downsampling
+    
+    Returns
+    -------
+    AnnData - A new subsetted AnnData object.
+    """
+    if clname2 is None:
+        clname2 = clname1
+    
+    frac = nc/adata.shape[0]
+    cu = q(adata.obs[clname1])
+    cuu = np.unique(cu)
+    CELLS=[]
+    obsn = q(adata.obs_names)
+    for i in range(cuu.size):
+        print(i)
+        ix = np.where(cu==cuu[i])[0]
+        a1 = adata[ix,:]
+        lc = q(a1.obs[clname2])
+        lcu,lcuc = np.unique(lc,return_counts=True)
+        z=0
+        cells=[]
+        for j in range(lcu.size):
+            CELLS.append(q(obsn[ix[np.where(lc==lcu[j])[0][np.random.choice(lcuc[j],replace=False,size=int(lcuc[j]*frac))]]]))
+            cells.append(CELLS[-1])
+        z = np.concatenate(cells).size
+        obsnc = obsn[ix]
+
+        if min(MIN,obsnc.size) - z > 0:
+            xx = obsnc[np.in1d(obsnc,np.concatenate(cells),invert=True)]
+            CELLS.append(np.random.choice(xx,replace=False,size = min(xx.size,min(30,obsnc.size) - z)))
+            print(len(CELLS[-1]))
+    CELLS=np.concatenate(CELLS)
+    assert np.unique(CELLS,return_counts=True)[1].max()==1
+    CELLS = q(adata.obs_names)[np.in1d(q(adata.obs_names),CELLS)]
+    return adata[CELLS].copy()
\ No newline at end of file

From b589082d5a6de11f8b1d72de7164235a72c8f6d2 Mon Sep 17 00:00:00 2001
From: atarashansky <alexander.tarashansky@czbiohub.org>
Date: Fri, 23 Jul 2021 10:35:11 -0700
Subject: [PATCH 2/6] fixed typo in q(): npaarray -> np.array

---
 scrna/subsample.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scrna/subsample.py b/scrna/subsample.py
index 7c5dd19c..b26766d8 100644
--- a/scrna/subsample.py
+++ b/scrna/subsample.py
@@ -1,7 +1,7 @@
 import numpy as np
 
 def q(x):
-    return npaarray(list(x))
+    return np.array(list(x))
 
 def subsample(adata,clname1,clname2=None,nc=10000,MIN=15):
     """

From 55459c48593442f634625125eb459d1bd0518fe2 Mon Sep 17 00:00:00 2001
From: atarashansky <alexander.tarashansky@czbiohub.org>
Date: Fri, 23 Jul 2021 10:39:01 -0700
Subject: [PATCH 3/6] moved scrna/subsample.py to src/utilities/scrna/

---
 src/utilities/demux/.__init__.py.swp        | Bin 0 -> 1024 bytes
 src/utilities/scrna/__init__.py             |   0
 {scrna => src/utilities/scrna}/subsample.py |   0
 3 files changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 src/utilities/demux/.__init__.py.swp
 create mode 100644 src/utilities/scrna/__init__.py
 rename {scrna => src/utilities/scrna}/subsample.py (100%)

diff --git a/src/utilities/demux/.__init__.py.swp b/src/utilities/demux/.__init__.py.swp
new file mode 100644
index 0000000000000000000000000000000000000000..50b695f49b508f6364e14f019dc7125c69529801
GIT binary patch
literal 1024
zcmYc?$V<%2S1{7E)H7y40`mVE7!q?*laYjQGILT>a}sgNA|&GDGxIV_;^XxSDiMOC
NtkDn{4S@j*0RWJb3|0UD

literal 0
HcmV?d00001

diff --git a/src/utilities/scrna/__init__.py b/src/utilities/scrna/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/scrna/subsample.py b/src/utilities/scrna/subsample.py
similarity index 100%
rename from scrna/subsample.py
rename to src/utilities/scrna/subsample.py

From 5c63ce2202207aa92cecca2cadb9b635f30e4f5e Mon Sep 17 00:00:00 2001
From: atarashansky <alexander.tarashansky@czbiohub.org>
Date: Fri, 23 Jul 2021 10:39:40 -0700
Subject: [PATCH 4/6] removed stray .__init__.py.swp swap file from demux

---
 src/utilities/demux/.__init__.py.swp | Bin 1024 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 src/utilities/demux/.__init__.py.swp

diff --git a/src/utilities/demux/.__init__.py.swp b/src/utilities/demux/.__init__.py.swp
deleted file mode 100644
index 50b695f49b508f6364e14f019dc7125c69529801..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1024
zcmYc?$V<%2S1{7E)H7y40`mVE7!q?*laYjQGILT>a}sgNA|&GDGxIV_;^XxSDiMOC
NtkDn{4S@j*0RWJb3|0UD


From 0cbfe0a486ed728ead55226325c8168e267155a0 Mon Sep 17 00:00:00 2001
From: atarashansky <alexander.tarashansky@czbiohub.org>
Date: Fri, 23 Jul 2021 10:53:43 -0700
Subject: [PATCH 5/6] subsample: added exclude and verbose options; top-up now uses MIN

---
 src/utilities/scrna/subsample.py | 47 +++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 19 deletions(-)

diff --git a/src/utilities/scrna/subsample.py b/src/utilities/scrna/subsample.py
index b26766d8..2ff180a9 100644
--- a/src/utilities/scrna/subsample.py
+++ b/src/utilities/scrna/subsample.py
@@ -3,7 +3,7 @@
 def q(x):
     return np.array(list(x))
 
-def subsample(adata,clname1,clname2=None,nc=10000,MIN=15):
+def subsample(adata,clname1,clname2=None,nc=10000,MIN=15,exclude=[],verbose=False):
     """
     This function goes through each cell type label in `clname1` and then makes sure to select a proportional number
     of each type of cell from `clname2` within that cluster.
@@ -25,11 +25,18 @@ def subsample(adata,clname1,clname2=None,nc=10000,MIN=15):
         Annotation key 2. If None, `clname2` is set equal to `clname1`
         
     nc : int, optional, default 10000
-        Number of cells to target for downsampling. The final number of cells may be slightly lower.
-        Tune `nc` accordingly if you'd like to hit an exact target.
+        Number of cells to target for downsampling. The final number of cells may be different
+        due to the `MIN` parameter.
         
     MIN : int, optional, default 15
         Minimum cluster size after downsampling
+        
+    exclude : list, optional, default []
+        A list of labels to exclude from downsampling. Typically, you'd set
+        exclude=['nan'].
+        
+    verbose : bool, optional, default False
+        If True, outputs subsampling progress.
     
     Returns
     -------
@@ -44,23 +51,25 @@ def subsample(adata,clname1,clname2=None,nc=10000,MIN=15):
     CELLS=[]
     obsn = q(adata.obs_names)
     for i in range(cuu.size):
-        print(i)
-        ix = np.where(cu==cuu[i])[0]
-        a1 = adata[ix,:]
-        lc = q(a1.obs[clname2])
-        lcu,lcuc = np.unique(lc,return_counts=True)
-        z=0
-        cells=[]
-        for j in range(lcu.size):
-            CELLS.append(q(obsn[ix[np.where(lc==lcu[j])[0][np.random.choice(lcuc[j],replace=False,size=int(lcuc[j]*frac))]]]))
-            cells.append(CELLS[-1])
-        z = np.concatenate(cells).size
-        obsnc = obsn[ix]
+        if cuu[i] not in exclude:        
+            if (verbose):
+                print('Subsampling cluster',cuu[i],end='\033[1\ \r')
+
+            ix = np.where(cu==cuu[i])[0]
+            a1 = adata[ix,:]
+            lc = q(a1.obs[clname2])
+            lcu,lcuc = np.unique(lc,return_counts=True)
+            z=0
+            cells=[]
+            for j in range(lcu.size):
+                CELLS.append(q(obsn[ix[np.where(lc==lcu[j])[0][np.random.choice(lcuc[j],replace=False,size=int(lcuc[j]*frac))]]]))
+                cells.append(CELLS[-1])
+            z = np.concatenate(cells).size
+            obsnc = obsn[ix]
 
-        if min(MIN,obsnc.size) - z > 0:
-            xx = obsnc[np.in1d(obsnc,np.concatenate(cells),invert=True)]
-            CELLS.append(np.random.choice(xx,replace=False,size = min(xx.size,min(30,obsnc.size) - z)))
-            print(len(CELLS[-1]))
+            if min(MIN,obsnc.size) - z > 0:
+                xx = obsnc[np.in1d(obsnc,np.concatenate(cells),invert=True)]
+                CELLS.append(np.random.choice(xx,replace=False,size = min(xx.size,min(MIN,obsnc.size) - z)))
     CELLS=np.concatenate(CELLS)
     assert np.unique(CELLS,return_counts=True)[1].max()==1
     CELLS = q(adata.obs_names)[np.in1d(q(adata.obs_names),CELLS)]

From 4a64e8e172dec44ca2c168aa1b6d490b862dd9d9 Mon Sep 17 00:00:00 2001
From: atarashansky <alexander.tarashansky@czbiohub.org>
Date: Fri, 23 Jul 2021 11:23:55 -0700
Subject: [PATCH 6/6] subsample: print the final downsampled cell count

---
 src/utilities/scrna/subsample.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/utilities/scrna/subsample.py b/src/utilities/scrna/subsample.py
index 2ff180a9..0c78dd35 100644
--- a/src/utilities/scrna/subsample.py
+++ b/src/utilities/scrna/subsample.py
@@ -72,5 +72,7 @@ def subsample(adata,clname1,clname2=None,nc=10000,MIN=15,exclude=[],verbose=Fals
                 CELLS.append(np.random.choice(xx,replace=False,size = min(xx.size,min(MIN,obsnc.size) - z)))
     CELLS=np.concatenate(CELLS)
     assert np.unique(CELLS,return_counts=True)[1].max()==1
+    print('Downsampled to',CELLS.size,'cells',end='\033[1\ ')
+    
     CELLS = q(adata.obs_names)[np.in1d(q(adata.obs_names),CELLS)]
     return adata[CELLS].copy()
\ No newline at end of file