diff --git a/docs/release-notes/0.10.1.md b/docs/release-notes/0.10.1.md index 80e982be..ff4e59a4 100644 --- a/docs/release-notes/0.10.1.md +++ b/docs/release-notes/0.10.1.md @@ -6,6 +6,7 @@ ```{rubric} Bug fixes ``` * updates the behavior of `_check_gpu_X` for `require_cf`. It now only works for `pearson_residuals` calcs and corrects instead of throwing an error {pr}`154` {smaller}`S Dicks` +* Fixes the behavior of `pp.scale` with `mask_obs` and `max_value`. Now only the masked part gets clipped {pr}`158` {smaller}`S Dicks` ```{rubric} Misc ``` diff --git a/src/rapids_singlecell/preprocessing/_scale.py b/src/rapids_singlecell/preprocessing/_scale.py index 70a90c80..b826f98d 100644 --- a/src/rapids_singlecell/preprocessing/_scale.py +++ b/src/rapids_singlecell/preprocessing/_scale.py @@ -87,22 +87,21 @@ def scale( if isinstance(X, cp.ndarray): X, means, std = scale_array( - X, mask_obs=mask_obs, zero_center=zero_center, inplace=inplace + X, + mask_obs=mask_obs, + zero_center=zero_center, + inplace=inplace, + max_value=max_value, ) else: X, means, std = scale_sparse( - X, mask_obs=mask_obs, zero_center=zero_center, inplace=inplace + X, + mask_obs=mask_obs, + zero_center=zero_center, + inplace=inplace, + max_value=max_value, ) - if max_value: - if zero_center: - X = cp.clip(X, a_min=-max_value, a_max=max_value) - else: - if isinstance(X, sparse.spmatrix): - X.data[X.data > max_value] = max_value - else: - X[X > max_value] = max_value - if inplace: _set_obs_rep(adata, X, layer=layer, obsm=obsm) adata.var[str_mean_std[0]] = means.get() @@ -114,7 +113,7 @@ def scale( return X -def scale_array(X, *, mask_obs=None, zero_center=True, inplace=True): +def scale_array(X, *, mask_obs=None, zero_center=True, inplace=True, max_value=None): if not inplace: X = X.copy() if mask_obs is not None: @@ -129,14 +128,26 @@ def scale_array(X, *, mask_obs=None, zero_center=True, inplace=True): if zero_center: X -= mean X /= std + if max_value: + if zero_center: + X = cp.clip(X, 
a_min=-max_value, a_max=max_value) + else: + X[X > max_value] = max_value + return X, mean, std -def scale_sparse(X, *, mask_obs=None, zero_center=True, inplace=True): +def scale_sparse(X, *, mask_obs=None, zero_center=True, inplace=True, max_value=None): if zero_center: X = X.toarray() # inplace is True because we copied with `toarray` - return scale_array(X, mask_obs=mask_obs, zero_center=zero_center, inplace=True) + return scale_array( + X, + mask_obs=mask_obs, + zero_center=zero_center, + inplace=True, + max_value=max_value, + ) else: if mask_obs is not None: # checking inplace because we are going to update the matrix @@ -147,7 +158,10 @@ def scale_sparse(X, *, mask_obs=None, zero_center=True, inplace=True): X = X.copy() scale_rv = scale_sparse( - X[mask_obs, :], zero_center=zero_center, inplace=True + X[mask_obs, :], + zero_center=zero_center, + inplace=True, + max_value=max_value, ) X_sub, mean, std = scale_rv mask_array = cp.where(cp.array(mask_obs))[0].astype(cp.int32) @@ -188,4 +202,8 @@ def scale_sparse(X, *, mask_obs=None, zero_center=True, inplace=True): ) else: raise ValueError("The sparse matrix must be a CSR or CSC matrix") + + if max_value: + X.data[X.data > max_value] = max_value + return X, mean, std diff --git a/tests/test_scaling.py b/tests/test_scaling.py index b5d7afbc..a204fc97 100644 --- a/tests/test_scaling.py +++ b/tests/test_scaling.py @@ -28,6 +28,11 @@ [1, 0, 1, 0], [0, 0, 0, 0], ]) # with gene std 1,0,1,0 and center 0,0,0,0 +X_scaled_original_clipped = np.array([ + [-1, 1, 0, 0], + [1, 1, 1, 0], + [0, 1, 1, 0], +]) # with gene std 1,0,1,0 and center 0,2,1,0 X_for_mask = np.array([ [27, 27, 27, 27], @@ -56,6 +61,16 @@ [27, 27, 27, 27], [27, 27, 27, 27], ]) +X_scaled_for_mask_clipped = np.array([ + [27, 27, 27, 27], + [27, 27, 27, 27], + [-1, 1, 0, 0], + [1, 1, 1, 0], + [0, 1, 1, 0], + [27, 27, 27, 27], + [27, 27, 27, 27], +]) + @pytest.mark.parametrize("dtype", ["float32", "float64"]) def test_scale_simple(dtype): @@ -121,3 +136,25 @@ 
def test_clip(zero_center): if zero_center: assert adata.X.min() >= -1 assert adata.X.max() <= 1 + +@pytest.mark.parametrize( + ("mask_obs", "X", "X_scaled", "X_clipped"), + [ + (None, X_original, X_scaled_original, X_scaled_original_clipped), + ( + np.array((0, 0, 1, 1, 1, 0, 0), dtype=bool), + X_for_mask, + X_scaled_for_mask, + X_scaled_for_mask_clipped, + ), + ], +) +def test_scale_sparse(*, mask_obs, X, X_scaled, X_clipped): + adata = AnnData(csr_matrix(X).astype(np.float32)) + adata0 = rsc.get.anndata_to_GPU(adata,copy= True) + rsc.pp.scale(adata0, mask_obs=mask_obs, zero_center=False) + cp.testing.assert_allclose(adata0.X.toarray(), X_scaled) + # test scaling with zero_center=False and clipping at max_value=1 + adata1 = rsc.get.anndata_to_GPU(adata,copy= True) + rsc.pp.scale(adata1, zero_center=False, mask_obs=mask_obs, max_value=1) + cp.testing.assert_allclose(adata1.X.toarray(), X_clipped)