Skip to content

Commit

Permalink
refactoring as python package
Browse files Browse the repository at this point in the history
  • Loading branch information
Brian Hie committed Nov 18, 2018
1 parent 05d9c97 commit 5e8cc78
Show file tree
Hide file tree
Showing 17 changed files with 185 additions and 95 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@
*.txt
*~
R/*
ample.egg-info/
build/
data*
dist/
plots*
target*
temp*
1 change: 1 addition & 0 deletions ample/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .sketch import *
File renamed without changes.
171 changes: 87 additions & 84 deletions bin/sketch.py → ample/sketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,14 @@
from sklearn.metrics.pairwise import pairwise_distances
import sys

from kmeanspp import kmeanspp
from utils import log
from .kmeanspp import kmeanspp
from .utils import log

def gs(X, N, k='auto', seed=None, replace=False,
alpha=0.1, max_iter=200, verbose=0, labels=None):
def gs(X, N, **kwargs):
return gs_gap(X, N, **kwargs)

def gs_gap(X, N, k='auto', seed=None, replace=False,
alpha=0.1, max_iter=200, verbose=0, labels=None):
n_samples, n_features = X.shape

# Error checking and initialization.
Expand All @@ -24,7 +27,9 @@ def gs(X, N, k='auto', seed=None, replace=False,
X -= X.min(0)
X /= X.max()

low_unit, high_unit = 0., np.max(X)
X_ptp = X.ptp(0)

low_unit, high_unit = 0., max(X_ptp)

unit = (low_unit + high_unit) / 4.

Expand All @@ -34,22 +39,31 @@ def gs(X, N, k='auto', seed=None, replace=False,
if verbose > 1:
log('n_iter = {}'.format(n_iter))

grid = {}

unit_d = unit# * n_features
grid_table = np.zeros((n_samples, n_features))

for sample_idx in range(n_samples):
if verbose > 1:
if sample_idx % 10000 == 0:
log('sample_idx = {}'.format(sample_idx))

sample = X[sample_idx, :]
for d in range(n_features):
if X_ptp[d] <= unit:
continue

points_d = X[:, d]
curr_start = None
curr_interval = -1
for sample_idx in np.argsort(points_d):
if curr_start is None or \
curr_start + unit < points_d[sample_idx]:
curr_start = points_d[sample_idx]
curr_interval += 1
grid_table[sample_idx, d] = curr_interval

grid_cell = tuple(np.floor(sample / unit_d).astype(int))
grid = {}

for sample_idx in range(n_samples):
grid_cell = tuple(grid_table[sample_idx, :])
if grid_cell not in grid:
grid[grid_cell] = set()
grid[grid_cell].add(sample_idx)
grid[grid_cell] = []
grid[grid_cell].append(sample_idx)

del grid_table

if verbose:
log('Found {} non-empty grid cells'.format(len(grid)))
Expand Down Expand Up @@ -93,7 +107,7 @@ def gs(X, N, k='auto', seed=None, replace=False,

if verbose:
log('Found {} grid cells'.format(len(grid)))

gs_idx = []
for n in range(N):
grid_cells = list(grid.keys())
Expand All @@ -108,47 +122,8 @@ def gs(X, N, k='auto', seed=None, replace=False,

return sorted(gs_idx)

def gs_exact(X, N, k='auto', seed=None, replace=False,
tol=1e-3, n_iter=300, verbose=1):
ge_idx = gs(X, N, replace=replace)

dist = pairwise_distances(X, n_jobs=-1)

cost = dist.max()

iter_i = 0

while iter_i < n_iter:

if verbose:
log('iter_i = {}'.format(iter_i))

labels = np.argmin(dist[ge_idx, :], axis=0)

ge_idx_new = []
for cluster in range(N):
cluster_idx = np.nonzero(labels == cluster)[0]
if len(cluster_idx) == 0:
ge_idx_new.append(ge_idx[cluster])
continue
X_cluster = dist[cluster_idx, :]
X_cluster = X_cluster[:, cluster_idx]
within_idx = np.argmin(X_cluster.max(0))
ge_idx_new.append(cluster_idx[within_idx])
ge_idx = ge_idx_new

cost, prev_cost = dist[ge_idx, :].min(0).max(), cost
assert(cost <= prev_cost)

if prev_cost - cost < tol:
break

iter_i += 1

return ge_idx

def gs_gap(X, N, k='auto', seed=None, replace=False,
alpha=0.1, max_iter=200, verbose=0, labels=None):
def gs_grid(X, N, k='auto', seed=None, replace=False,
alpha=0.1, max_iter=200, verbose=0, labels=None):
n_samples, n_features = X.shape

# Error checking and initialization.
Expand All @@ -165,9 +140,7 @@ def gs_gap(X, N, k='auto', seed=None, replace=False,
X -= X.min(0)
X /= X.max()

X_ptp = X.ptp(0)

low_unit, high_unit = 0., max(X_ptp)
low_unit, high_unit = 0., np.max(X)

unit = (low_unit + high_unit) / 4.

Expand All @@ -177,31 +150,22 @@ def gs_gap(X, N, k='auto', seed=None, replace=False,
if verbose > 1:
log('n_iter = {}'.format(n_iter))

grid_table = np.zeros((n_samples, n_features))

for d in range(n_features):
if X_ptp[d] <= unit:
continue

points_d = X[:, d]
curr_start = None
curr_interval = -1
for sample_idx in np.argsort(points_d):
if curr_start is None or \
curr_start + unit < points_d[sample_idx]:
curr_start = points_d[sample_idx]
curr_interval += 1
grid_table[sample_idx, d] = curr_interval

grid = {}

unit_d = unit# * n_features

for sample_idx in range(n_samples):
grid_cell = tuple(grid_table[sample_idx, :])
if grid_cell not in grid:
grid[grid_cell] = []
grid[grid_cell].append(sample_idx)
if verbose > 1:
if sample_idx % 10000 == 0:
log('sample_idx = {}'.format(sample_idx))

sample = X[sample_idx, :]

del grid_table
grid_cell = tuple(np.floor(sample / unit_d).astype(int))

if grid_cell not in grid:
grid[grid_cell] = set()
grid[grid_cell].add(sample_idx)

if verbose:
log('Found {} non-empty grid cells'.format(len(grid)))
Expand Down Expand Up @@ -245,7 +209,7 @@ def gs_gap(X, N, k='auto', seed=None, replace=False,

if verbose:
log('Found {} grid cells'.format(len(grid)))

gs_idx = []
for n in range(N):
grid_cells = list(grid.keys())
Expand All @@ -260,6 +224,45 @@ def gs_gap(X, N, k='auto', seed=None, replace=False,

return sorted(gs_idx)

def gs_exact(X, N, k='auto', seed=None, replace=False,
tol=1e-3, n_iter=300, verbose=1):
ge_idx = gs(X, N, replace=replace)

dist = pairwise_distances(X, n_jobs=-1)

cost = dist.max()

iter_i = 0

while iter_i < n_iter:

if verbose:
log('iter_i = {}'.format(iter_i))

labels = np.argmin(dist[ge_idx, :], axis=0)

ge_idx_new = []
for cluster in range(N):
cluster_idx = np.nonzero(labels == cluster)[0]
if len(cluster_idx) == 0:
ge_idx_new.append(ge_idx[cluster])
continue
X_cluster = dist[cluster_idx, :]
X_cluster = X_cluster[:, cluster_idx]
within_idx = np.argmin(X_cluster.max(0))
ge_idx_new.append(cluster_idx[within_idx])
ge_idx = ge_idx_new

cost, prev_cost = dist[ge_idx, :].min(0).max(), cost
assert(cost <= prev_cost)

if prev_cost - cost < tol:
break

iter_i += 1

return ge_idx

def srs(X, N, seed=None, replace=False, prenormalized=False):
n_samples, n_features = X.shape

Expand Down
58 changes: 58 additions & 0 deletions ample/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import errno
from fbpca import pca
import datetime
import numpy as np
import os
from sklearn.random_projection import SparseRandomProjection as JLSparse
import sys

# Default parameters.
DIMRED = 100

def log(string):
string = str(string)
sys.stdout.write(str(datetime.datetime.now()) + ' | [ample] ')
sys.stdout.write(string + '\n')
sys.stdout.flush()

def reduce_dimensionality(X, method='svd', dimred=DIMRED, raw=False):
if method == 'svd':
k = min((dimred, X.shape[0], X.shape[1]))
U, s, Vt = pca(X, k=k, raw=raw)
return U[:, range(k)] * s[range(k)]
elif method == 'jl_sparse':
jls = JLSparse(n_components=dimred)
return jls.fit_transform(X).toarray()
elif method == 'hvg':
X = X.tocsc()
disp = dispersion(X)
highest_disp_idx = np.argsort(disp)[::-1][:dimred]
return X[:, highest_disp_idx].toarray()
else:
sys.stderr.write('ERROR: Unknown method {}.'.format(svd))
exit(1)

def dispersion(X, eps=1e-10):
mean = X.mean(0).A1

X_nonzero = X[:, mean > eps]
nonzero_mean = X_nonzero.mean(0).A1
nonzero_var = (X_nonzero.multiply(X_nonzero)).mean(0).A1
del X_nonzero

nonzero_dispersion = (nonzero_var / nonzero_mean)

dispersion = np.zeros(X.shape[1])
dispersion[mean > eps] = nonzero_dispersion
dispersion[mean <= eps] = float('-inf')

return dispersion

def mkdir_p(path):
try:
os.makedirs(path)
except OSError as exc: # Python >2.5
if exc.errno == errno.EEXIST and os.path.isdir(path):
pass
else:
raise
5 changes: 5 additions & 0 deletions bin/artificial_density.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@
X_dimred = np.concatenate(Xs)
cell_labels = np.array(labels, dtype=int)

from ample import gs, gs_gap
samp_idx = gs_gap(X_dimred, 3000, replace=True, verbose=10)
report_cluster_counts(cell_labels[samp_idx])
exit()

experiments(
X_dimred, NAMESPACE,
rare=True, cell_labels=cell_labels, rare_label=2,
Expand Down
2 changes: 1 addition & 1 deletion bin/benchmark_samplers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from experiments import dropclust_preprocess, dropclust_sample
from save_mtx import save_mtx
from sketch import *
from ample import *
from utils import log

if __name__ == '__main__':
Expand Down
4 changes: 2 additions & 2 deletions bin/experiments.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

from process import load_names
from save_mtx import save_mtx
from sketch import *
from ample import *
from utils import *

# Clustering-based downsampling efficiency.
Expand Down Expand Up @@ -224,7 +224,7 @@ def experiments(X_dimred, name, n_seeds=10, **kwargs):

sampling_fns = [
uniform,
gs,
gs_grid,
gs_gap,
srs,
louvain1,
Expand Down
2 changes: 1 addition & 1 deletion bin/hema.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def keep_valid():
for idx in range(X.shape[0]):
of.write('hematopoeisis{}\t{}'.format(idx, cell_names[idx]))

from sketch import *
from ample import *

gs_idx = louvain1(X_dimred, 300, replace=True)
write_table(X[gs_idx, :].toarray(), genes, 'data/pseudotime/' + NAMESPACE + '_gs')
Expand Down
2 changes: 1 addition & 1 deletion bin/ica.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
else:
X_dimred = np.loadtxt('data/dimred/{}_{}.txt'.format(METHOD, NAMESPACE))

from sketch import gs
from ample import gs
samp_idx = gs(X_dimred, 1000, replace=False)
save_sketch(X, samp_idx, genes, NAMESPACE + '1000')

Expand Down
2 changes: 1 addition & 1 deletion bin/monocyte_macrophage.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def keep_valid():
for idx in range(X.shape[0]):
of.write('mono_macro{}\t{}'.format(idx, cell_names[idx]))

from sketch import gs, gs_gap, uniform
from ample import gs, gs_gap, uniform

gs_idx = gs(X_dimred, 110, replace=False)
write_table(X[gs_idx, :].toarray(), genes, 'data/pseudotime/' + NAMESPACE + '_gs')
Expand Down
2 changes: 1 addition & 1 deletion bin/mouse_brain.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def keep_valid(datasets):
kmeans=False, visualize_orig=False
)

from sketch import gs
from ample import gs
samp_idx = gs(X_dimred, 1000, replace=False)
save_sketch(X, samp_idx, genes, NAMESPACE + '1000')

Expand Down
2 changes: 1 addition & 1 deletion bin/mouse_brain_astrocyte.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def auroc(X, genes, labels, focus, background=None):
else:
X_dimred = np.loadtxt('data/dimred/{}_{}.txt'.format(METHOD, NAMESPACE))

from sketch import gs
from ample import gs
samp_idx = gs(X_dimred, 20000, replace=False)

#from anndata import AnnData
Expand Down
Loading

0 comments on commit 5e8cc78

Please sign in to comment.