From 66d1f9f877cca7efe7304bf6609bd4f9f32cabdf Mon Sep 17 00:00:00 2001 From: atarashansky Date: Fri, 23 Jul 2021 10:28:30 -0700 Subject: [PATCH 1/6] Added intelligent subsampling module --- scrna/subsample.py | 67 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 scrna/subsample.py diff --git a/scrna/subsample.py b/scrna/subsample.py new file mode 100644 index 00000000..7c5dd19c --- /dev/null +++ b/scrna/subsample.py @@ -0,0 +1,67 @@ +import numpy as np + +def q(x): + return npaarray(list(x)) + +def subsample(adata,clname1,clname2=None,nc=10000,MIN=15): + """ + This function goes through each cell type label in `clname1` and then makes sure to select a proportional number + of each type of cell from `clname2` within that cluster. + + So, for example, let's say clname1 = tissue annotations, and clname2 = subtype annotations. + This function would go tissue by tissue, and sample a proportional number of cells from each present subtype + annotation present within that tissue. This ensures all cell types are included when downsampling. + + If `clname1=clname2` then it just samples a proportional number of cells from each cluster. + + Parameters + ----------- + adata : AnnData object + + clname1 : str + Annotation key 1 + + clname2 : str, optional, default None + Annotation key 2. If None, `clname2` is set equal to `clname1` + + nc : int, optional, default 10000 + Number of cells to target for downsampling. The final number of cells may be slightly lower. + Tune `nc` accordingly if you'd like to hit an exact target. + + MIN : int, optional, default 15 + Minimum cluster size after downsampling + + Returns + ------- + AnnData - A new subsetted AnnData object. + """ + if clname2 is None: + clname2 = clname1 + + frac = nc/adata.shape[0] + cu = q(adata.obs[clname1]) + cuu = np.unique(cu) + CELLS=[] + obsn = q(adata.obs_names) + for i in range(cuu.size): + print(i) + ix = np.where(cu==cuu[i])[0] + a1 = adata[ix,:] + lc = q(a1.obs[clname2]) + lcu,lcuc = np.unique(lc,return_counts=True) + z=0 + cells=[] + for j in range(lcu.size): + CELLS.append(q(obsn[ix[np.where(lc==lcu[j])[0][np.random.choice(lcuc[j],replace=False,size=int(lcuc[j]*frac))]]])) + cells.append(CELLS[-1]) + z = np.concatenate(cells).size + obsnc = obsn[ix] + + if min(MIN,obsnc.size) - z > 0: + xx = obsnc[np.in1d(obsnc,np.concatenate(cells),invert=True)] + CELLS.append(np.random.choice(xx,replace=False,size = min(xx.size,min(30,obsnc.size) - z))) + print(len(CELLS[-1])) + CELLS=np.concatenate(CELLS) + assert np.unique(CELLS,return_counts=True)[1].max()==1 + CELLS = q(adata.obs_names)[np.in1d(q(adata.obs_names),CELLS)] + return adata[CELLS].copy() \ No newline at end of file From b589082d5a6de11f8b1d72de7164235a72c8f6d2 Mon Sep 17 00:00:00 2001 From: atarashansky Date: Fri, 23 Jul 2021 10:35:11 -0700 Subject: [PATCH 2/6] fixed typo --- scrna/subsample.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrna/subsample.py b/scrna/subsample.py index 7c5dd19c..b26766d8 100644 --- a/scrna/subsample.py +++ b/scrna/subsample.py @@ -1,7 +1,7 @@ import numpy as np def q(x): - return npaarray(list(x)) + return np.array(list(x)) def subsample(adata,clname1,clname2=None,nc=10000,MIN=15): """ From 55459c48593442f634625125eb459d1bd0518fe2 Mon Sep 17 00:00:00 2001 From: atarashansky Date: Fri, 23 Jul 2021 10:39:01 -0700 Subject: [PATCH 3/6] reorganized file locations --- src/utilities/demux/.__init__.py.swp | Bin 0 -> 1024 bytes src/utilities/scrna/__init__.py | 0 {scrna => src/utilities/scrna}/subsample.py | 0 3 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 src/utilities/demux/.__init__.py.swp create mode 100644 src/utilities/scrna/__init__.py rename {scrna => src/utilities/scrna}/subsample.py (100%) diff --git a/src/utilities/demux/.__init__.py.swp b/src/utilities/demux/.__init__.py.swp new file mode 100644 index 0000000000000000000000000000000000000000..50b695f49b508f6364e14f019dc7125c69529801 GIT binary patch literal 1024 zcmYc?$V<%2S1{7E)H7y40`mVE7!q?*laYjQGILT>a}sgNA|&GDGxIV_;^XxSDiMOC NtkDn{4S@j*0RWJb3|0UD literal 0 HcmV?d00001 diff --git a/src/utilities/scrna/__init__.py b/src/utilities/scrna/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/scrna/subsample.py b/src/utilities/scrna/subsample.py similarity index 100% rename from scrna/subsample.py rename to src/utilities/scrna/subsample.py From 5c63ce2202207aa92cecca2cadb9b635f30e4f5e Mon Sep 17 00:00:00 2001 From: atarashansky Date: Fri, 23 Jul 2021 10:39:40 -0700 Subject: [PATCH 4/6] removed bad file --- src/utilities/demux/.__init__.py.swp | Bin 1024 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 src/utilities/demux/.__init__.py.swp diff --git a/src/utilities/demux/.__init__.py.swp b/src/utilities/demux/.__init__.py.swp deleted file mode 100644 index 50b695f49b508f6364e14f019dc7125c69529801..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1024 zcmYc?$V<%2S1{7E)H7y40`mVE7!q?*laYjQGILT>a}sgNA|&GDGxIV_;^XxSDiMOC NtkDn{4S@j*0RWJb3|0UD From 0cbfe0a486ed728ead55226325c8168e267155a0 Mon Sep 17 00:00:00 2001 From: atarashansky Date: Fri, 23 Jul 2021 10:53:43 -0700 Subject: [PATCH 5/6] update --- src/utilities/scrna/subsample.py | 47 +++++++++++++++++++------------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/src/utilities/scrna/subsample.py b/src/utilities/scrna/subsample.py index b26766d8..2ff180a9 100644 --- a/src/utilities/scrna/subsample.py +++ b/src/utilities/scrna/subsample.py @@ -3,7 +3,7 @@ def q(x): return np.array(list(x)) -def subsample(adata,clname1,clname2=None,nc=10000,MIN=15): +def subsample(adata,clname1,clname2=None,nc=10000,MIN=15,exclude=[],verbose=False): """ This function goes through each cell type label in `clname1` and then makes sure to select a proportional number of each type of cell from `clname2` within that cluster. @@ -25,11 +25,18 @@ def subsample(adata,clname1,clname2=None,nc=10000,MIN=15): Annotation key 2. If None, `clname2` is set equal to `clname1` nc : int, optional, default 10000 - Number of cells to target for downsampling. The final number of cells may be slightly lower. - Tune `nc` accordingly if you'd like to hit an exact target. + Number of cells to target for downsampling. The final number of cells may be different + due to the `MIN` parameter. MIN : int, optional, default 15 Minimum cluster size after downsampling + + exclude : list, optional, default [] + A list of labels to exclude from downsampling. Typically, you'd set + exclude=['nan']. + + verbose : bool, optional, default False + If True, outputs subsampling progress. Returns ------- @@ -44,23 +51,25 @@ def subsample(adata,clname1,clname2=None,nc=10000,MIN=15): CELLS=[] obsn = q(adata.obs_names) for i in range(cuu.size): - print(i) - ix = np.where(cu==cuu[i])[0] - a1 = adata[ix,:] - lc = q(a1.obs[clname2]) - lcu,lcuc = np.unique(lc,return_counts=True) - z=0 - cells=[] - for j in range(lcu.size): - CELLS.append(q(obsn[ix[np.where(lc==lcu[j])[0][np.random.choice(lcuc[j],replace=False,size=int(lcuc[j]*frac))]]])) - cells.append(CELLS[-1]) - z = np.concatenate(cells).size - obsnc = obsn[ix] + if cuu[i] not in exclude: + if (verbose): + print('Subsampling cluster',cuu[i],end='\033[1\ \r') + + ix = np.where(cu==cuu[i])[0] + a1 = adata[ix,:] + lc = q(a1.obs[clname2]) + lcu,lcuc = np.unique(lc,return_counts=True) + z=0 + cells=[] + for j in range(lcu.size): + CELLS.append(q(obsn[ix[np.where(lc==lcu[j])[0][np.random.choice(lcuc[j],replace=False,size=int(lcuc[j]*frac))]]])) + cells.append(CELLS[-1]) + z = np.concatenate(cells).size + obsnc = obsn[ix] - if min(MIN,obsnc.size) - z > 0: - xx = obsnc[np.in1d(obsnc,np.concatenate(cells),invert=True)] - CELLS.append(np.random.choice(xx,replace=False,size = min(xx.size,min(30,obsnc.size) - z))) - print(len(CELLS[-1])) + if min(MIN,obsnc.size) - z > 0: + xx = obsnc[np.in1d(obsnc,np.concatenate(cells),invert=True)] + CELLS.append(np.random.choice(xx,replace=False,size = min(xx.size,min(MIN,obsnc.size) - z))) CELLS=np.concatenate(CELLS) assert np.unique(CELLS,return_counts=True)[1].max()==1 CELLS = q(adata.obs_names)[np.in1d(q(adata.obs_names),CELLS)] From 4a64e8e172dec44ca2c168aa1b6d490b862dd9d9 Mon Sep 17 00:00:00 2001 From: atarashansky Date: Fri, 23 Jul 2021 11:23:55 -0700 Subject: [PATCH 6/6] final update --- src/utilities/scrna/subsample.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/utilities/scrna/subsample.py b/src/utilities/scrna/subsample.py index 2ff180a9..0c78dd35 100644 --- a/src/utilities/scrna/subsample.py +++ b/src/utilities/scrna/subsample.py @@ -72,5 +72,7 @@ def subsample(adata,clname1,clname2=None,nc=10000,MIN=15,exclude=[],verbose=Fals CELLS.append(np.random.choice(xx,replace=False,size = min(xx.size,min(MIN,obsnc.size) - z))) CELLS=np.concatenate(CELLS) assert np.unique(CELLS,return_counts=True)[1].max()==1 + print('Downsampled to',CELLS.size,'cells',end='\033[1\ ') + CELLS = q(adata.obs_names)[np.in1d(q(adata.obs_names),CELLS)] return adata[CELLS].copy() \ No newline at end of file