Use feature_mask for logcellcounts (#6)

* BREAKING: scparams now uses feature_mask to compute logcellcounts * logcellcounts now uses feature_mask * sctransform now takes feature_mask parameter * Use StableHashTraits for hashing instead of custom solution * Bump internal hash version due to scparam changes * Add CHANGELOG
rasmushenningsson · Jun 18, 2024 · 14cd387 · 14cd387 · rasmushenningsson · Jun 18, 2024
1 parent d1d2796
commit 14cd387
Show file tree

Hide file tree

Showing 7 changed files with 122 additions and 68 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,16 @@
+# SCTransform.jl changelog
+
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [Unreleased]
+
+## [0.2.0] - 2024-06-18
+
+### Breaking
+
+* `scparams` now uses `feature_mask` when computing `logcellcounts`. This is important if there are multiple modalities in the data (e.g. Gene Expression and Antibody counts.)
+* `sctransform` now takes `feature_mask` parameter, which controls which features are used to compute `logcellcounts`. Defaults to only using "Gene Expression" features.
+* Caching of `scparams` results now uses `StableHashTraits` for hashing.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "SCTransform"
 uuid = "f56ea72b-d1ea-403e-9c0c-95d94e8a8532"
 authors = ["Rasmus Henningsson <[email protected]>"]
-version = "0.1.3"
+version = "0.2.0"
 
 [deps]
 CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
@@ -14,6 +14,7 @@ SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
 Scratch = "6c6a2e73-6563-6170-7368-637461726353"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
+StableHashTraits = "c5dd0088-6c3f-4803-b00e-f31a60c170fa"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 
 [compat]
@@ -28,13 +29,16 @@ SHA = "0.7"
 Scratch = "1.1"
 SparseArrays = "1"
 SpecialFunctions = "0.10, 1, 2"
+StableHashTraits = "1.2.0"
 Statistics = "1"
 julia = "1.8"
 
 [extras]
 DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
+Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SingleCell10x = "eddce310-d14b-4c2d-aa06-cfe9e2f7af98"
+StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test", "SingleCell10x", "DataFrames"]
+test = ["Test", "SingleCell10x", "DataFrames", "Random", "StableRNGs"]
diff --git a/src/SCTransform.jl b/src/SCTransform.jl
@@ -15,6 +15,7 @@ using Scratch
 using SHA
 using DelimitedFiles
 using CodecZlib
+using StableHashTraits
 
 include("utils.jl")
 include("table.jl")

diff --git a/src/cache.jl b/src/cache.jl
@@ -1,57 +1,11 @@
-# Here is a 64 bit hash implementation that
-# * Is stable across versions (unless we choose to change it ourselves)
-# * Takes all elements of a vector into account
-# * Is reasonable fast (otherwise we would've used sha256)
-
-function _hash(n::UInt64) # copied from Julia Base hash_64_64 (MIT license)
-    a::UInt64 = n
-    a = ~a + a << 21
-    a =  a ⊻ a >> 24
-    a =  a + a << 3 + a << 8
-    a =  a ⊻ a >> 14
-    a =  a + a << 2 + a << 4
-    a =  a ⊻ a >> 28
-    a =  a + a << 31
-    return a
-end
-_hash(n::UInt32) = _hash(UInt64(n))
-_hash(n::Union{Bool,UInt32}) = _hash(UInt64(n))
-
-_hash(n,h::UInt64) = _hash(n) - 3h
-
-function _fullhash_impl(v, h=UInt(0))
-	for x in v
-		h = _hash(x,h)
-	end
-	h
-end
-
-_fullhash(v::AbstractVector{Float64}, h=UInt64(0)) = _fullhash_impl(reinterpret(UInt64,v), h)
-_fullhash(v::AbstractVector{Int64}, h=UInt64(0)) = _fullhash_impl(reinterpret(UInt64,v), h)
-_fullhash(v::AbstractVector{Float32}, h=UInt64(0)) = _fullhash_impl(reinterpret(UInt32,v), h)
-_fullhash(v::AbstractVector{Int32}, h=UInt64(0)) = _fullhash_impl(reinterpret(UInt32,v), h)
-_fullhash(v, h=UInt64(0)) = _fullhash_impl(v, h)
-
-
-
-const SCPARAMS_VERSION = v"0.1.0"
+const SCPARAMS_VERSION = v"0.2.0"
 
-function _scparams_checksum(X::SparseMatrixCSC,method,min_cells,feature_mask)
+function _scparams_checksum(X::SparseMatrixCSC, method::Symbol, min_cells::Int, feature_mask::BitVector)
 	@assert method in (:poisson, :nb) "Method must be :poisson or :nb"
 	P,N = size(X)
 
-	h = String[]
-	push!(h, string(P))
-	push!(h, string(N))
-	push!(h, string(_fullhash(X.colptr)))
-	push!(h, string(_fullhash(X.rowval)))
-	push!(h, string(_fullhash(X.nzval)))
-	push!(h, string(method))
-	push!(h, string(min_cells))
-	push!(h, string(_fullhash(feature_mask)))
-	push!(h, string(SCPARAMS_VERSION))
-
-	h = bytes2hex(sha256(join(h,"__<sep>__")))
+	tup = (P, N, X.colptr, X.rowval, X.nzval, method, min_cells, feature_mask, SCPARAMS_VERSION)
+	bytes2hex(stable_hash(tup; version=3))
 end
 
 function _scparams_cache_convert_column(data, type)

diff --git a/src/params.jl b/src/params.jl
@@ -4,7 +4,29 @@
 # logcellcounts(X::SparseMatrixCSC) = log10.(max.(1,vec(sum(X;dims=2))))
 
 # Assumes each column is a cell
-logcellcounts(X::SparseMatrixCSC) = log10.(max.(1,vec(sum(X;dims=1))))
+# logcellcounts(X::SparseMatrixCSC) = log10.(max.(1,vec(sum(X;dims=1))))
+
+
+# Assumes each column is a cell
+function logcellcounts(X::SparseMatrixCSC, feature_mask)
+	P,N = size(X)
+	@assert length(feature_mask)==P
+	out = zeros(N)
+
+	# TODO: Can we make this a bit faster? If it wasn't for the feature_mask, it would just be sum(X;dims=1)
+	R = rowvals(X)
+	V = nonzeros(X)
+	for j=1:N
+		s = sum(nzrange(X,j); init=0) do k
+			V[k]*feature_mask[R[k]]
+		end
+
+		out[j] = log10(max(1,s))
+	end
+
+	out
+end
+
 
 # Assumes each column is a gene
 # function loggenemean(X::SparseMatrixCSC)
@@ -101,7 +123,7 @@ function scparams_estimate(::Type{T}, X::AbstractSparseMatrix{Tv,Ti};
 	nthreads = max(nthreads,1)
 	P,N = size(X)
 
-	logCellCounts=logcellcounts(X)
+	logCellCounts=logcellcounts(X, feature_mask)
 	logGeneMean=loggenemean(X)
 
 	@assert method in (:poisson, :nb) "Method must be :poisson or :nb"
@@ -356,7 +378,7 @@ See also: [`sctransform`](@ref)
 """
 function scparams(::Type{T}, X::AbstractSparseMatrix, features;
                   method=:poisson,
-                  min_cells::Integer=5,
+                  min_cells::Int=5,
                   feature_type = hasproperty(features, :feature_type) ? "Gene Expression" : nothing,
                   feature_mask = feature_type !== nothing ? features.feature_type.==feature_type : trues(size(X,1)),
                   feature_names = hasproperty(features,:name) ? features.name : features.id,
@@ -367,6 +389,7 @@ function scparams(::Type{T}, X::AbstractSparseMatrix, features;
                   kwargs...) where T
 	P,N = size(X)
 	length(feature_names) == P || throw(DimensionMismatch("The number of rows in the count matrix and the number of features do not match."))
+	feature_mask = convert(BitVector, feature_mask)
 
 	if cache_read || cache_write
 		h = _scparams_checksum(X,method,min_cells,feature_mask)

diff --git a/src/transform.jl b/src/transform.jl
@@ -1,16 +1,20 @@
 """
 	sctransform(X::AbstractSparseMatrix, features, params;
-                transpose = false,
-                feature_id_columns = [:id,:feature_type],
-                cell_ind = 1:size(X,2),
-                clip=sqrt(size(X,2)/30))
+	            transpose = false,
+	            feature_id_columns = [:id,:feature_type],
+	            feature_type,
+	            feature_mask,
+	            cell_ind = 1:size(X,2),
+	            clip=sqrt(size(X,2)/30))
 
 Computes the SCTransform of the sparse matrix `X`, with features as rows and cells as columns.
 `features` should be a table (e.g. DataFrame or NamedTuple) with feature annotations.
 `params` is a table with reguralized feature parameter estimates - typically estimated using the function `scparams`.
 
 * `transpose` - set to true to transpose the output (cells as rows, features as columns).
 * `feature_id_columns` is a vector of column names in `features`. The rows in `features` must be unique based on these columns.
+* `feature_type` - Convenience parameter used for `feature_mask` default. Defaults to "Gene Expression" if `features` has a `feature_type` column.
+* `feature_mask` - Vector of booleans deciding which features to use. If set explicitly, the `feature_type` parameter is ignored, otherwise `feature_mask` defaults to only choosing the `feature_type` specified. Affects how logcellcounts are computed. Normally expected to match `feature_mask` used when calling `scparams`.
 * `cell_ind` is vector of cell indices to include in the output. This is computationally more efficient than subsetting afterwards, but yields the same result.
 * `clip` - values less than `-clip` or larger than `clip` are clamped, to reduce the impact of outliers.
 
@@ -19,11 +23,15 @@ See also: [`scparams`](@ref)
 function sctransform(X::AbstractSparseMatrix, features, params;
                      transpose = false,
                      feature_id_columns = [:id,:feature_type],
+                     feature_type = hasproperty(features, :feature_type) ? "Gene Expression" : nothing,
+                     feature_mask = feature_type !== nothing ? features.feature_type.==feature_type : trues(size(X,1)),
                      cell_ind = 1:size(X,2),
                      clip=sqrt(size(X,2)/30))
 
 	@assert size(X,1)==length(getproperty(features,first(propertynames(features)))) "The number of rows in X and features must match"
 
+	feature_mask = convert(BitVector, feature_mask)
+
 	β0 = params.beta0
 	β1 = params.beta1
 	θ  = params.theta
@@ -40,8 +48,7 @@ function sctransform(X::AbstractSparseMatrix, features, params;
 	any(isnothing, feature_ind) && throw(DomainError("Feature ids in `params` does not match ids in `features`."))
 
 
-
-	logCellCounts = logcellcounts(X)[cell_ind]
+	logCellCounts = logcellcounts(X, feature_mask)[cell_ind]
 
 	# TODO: Do not create intermediate X[feature_ind,cell_ind]
 	X = X[feature_ind,cell_ind]

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -2,6 +2,9 @@ using Test
 using SCTransform
 using SCTransform: scparams_estimate, scparams_detect_outliers, scparams_bandwidth, scparams_regularize
 using SingleCell10x
+using SparseArrays
+using Random
+using StableRNGs
 
 using DelimitedFiles
 using DataFrames
@@ -21,20 +24,39 @@ end
 maxnorm(x) = maximum(abs, x)
 
 
+simple_logcellcounts(X::SparseMatrixCSC) = log10.(max.(1,vec(sum(X;dims=1))))
+
+
 
 @testset "SCTransform.jl" begin
+	@testset "logcellcounts" begin
+		X = sprand(StableRNG(7833), 50, 40, 0.1, (rng,k)->rand(rng,1:1000,k))
+		@test SCTransform.logcellcounts(X,trues(50)) == simple_logcellcounts(X)
+
+		fm = bitrand(StableRNG(9864), 50)
+		@test SCTransform.logcellcounts(X,fm) == simple_logcellcounts(X[fm,:])
+
+		X .*= bitrand(StableRNG(5493), 1, 40)
+		dropzeros!(X)
+		@test SCTransform.logcellcounts(X,fm) == simple_logcellcounts(X[fm,:])
+	end
+
+
 	@testset "500_PBMC_50genes" begin
-		filename = "data/500_PBMC_50genes/filtered_feature_bc_matrix.h5"
+		data_path = joinpath(pkgdir(SCTransform), "test/data")
+
+
+		filename = joinpath(data_path, "500_PBMC_50genes/filtered_feature_bc_matrix.h5")
 		X,f,b = read10x(filename)
 
-		pnonreg_ans = read_model_pars("data/500_PBMC_50genes/model_pars.csv")
-		gm_ans = readdlm("data/500_PBMC_50genes/genemean.csv", Float64)
+		pnonreg_ans = read_model_pars(joinpath(data_path, "500_PBMC_50genes/model_pars.csv"))
+		gm_ans = readdlm(joinpath(data_path, "500_PBMC_50genes/genemean.csv"), Float64)
 
-		pnonreg_ans2 = read_model_pars("data/500_PBMC_50genes/model_pars2.csv")
+		pnonreg_ans2 = read_model_pars(joinpath(data_path, "500_PBMC_50genes/model_pars2.csv"))
 		bw_ans2 = 0.243867597143564
-		preg_ans2 = read_model_pars("data/500_PBMC_50genes/model_pars_fit2.csv")
-		gm_ans2 = readdlm("data/500_PBMC_50genes/genemean2.csv", Float64)
-		Z_ans2 = readdlm("data/500_PBMC_50genes/transformed2_every_10th_cell.csv", ',', Float64)
+		preg_ans2 = read_model_pars(joinpath(data_path, "500_PBMC_50genes/model_pars_fit2.csv"))
+		gm_ans2 = readdlm(joinpath(data_path, "500_PBMC_50genes/genemean2.csv"), Float64)
+		Z_ans2 = readdlm(joinpath(data_path, "500_PBMC_50genes/transformed2_every_10th_cell.csv"), ',', Float64)
 
 
 
@@ -148,6 +170,33 @@ maxnorm(x) = maximum(abs, x)
 			@test_throws DomainError sctransform(X2, f2, preg3)
 		end
 
+
+		@testset "feature_mask" begin
+			f_ind = setdiff(1:50, [2,18,33,35,50]) # remove some variables that are excluded by scparams - i.e. they will only affect logcellcounts!
+
+			# Ground truth by actually removing the variables
+			X3 = X[f_ind,:]
+			f3 = SCTransform.subset_rows(f, f_ind) # TODO: avoid using internal function
+			preg3 = scparams(X3, f3; use_cache=false)
+			Z3 = sctransform(X3, f3, preg3)
+
+			# Test by only excluding the varaibles using feature_mask
+			feature_mask = falses(50)
+			feature_mask[f_ind] .= true
+			preg_fm = scparams(X, f; feature_mask, use_cache=false)
+			Z_fm = sctransform(X, f, preg_fm; feature_mask)
+
+			@test preg_fm == preg3
+			@test Z_fm ≈ Z3
+
+
+			# Sanity check that it's different from including all genes - i.e. using the feature_mask and thus changing logcellcount makes a difference
+			preg = scparams(X, f; use_cache=false)
+			@test preg != preg_fm
+			Z = sctransform(X, f, preg)
+			@test !(Z_fm ≈ Z)
+		end
+
 	end
 end