Skip to content

Commit

Permalink
Use feature_mask for logcellcounts (#6)
Browse files Browse the repository at this point in the history
* BREAKING: scparams now uses feature_mask to compute logcellcounts
* logcellcounts now uses feature_mask
* sctransform now takes feature_mask parameter
* Use StableHashTraits for hashing instead of custom solution
* Bump internal hash version due to scparam changes
* Add CHANGELOG
  • Loading branch information
rasmushenningsson authored Jun 18, 2024
1 parent d1d2796 commit 14cd387
Show file tree
Hide file tree
Showing 7 changed files with 122 additions and 68 deletions.
16 changes: 16 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# SCTransform.jl changelog

All notable changes to this project will be documented in this file.

The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]

## [0.2.0] - 2024-06-18

### Breaking

* `scparams` now uses `feature_mask` when computing `logcellcounts`. This is important if there are multiple modalities in the data (e.g. Gene Expression and Antibody counts.)
* `sctransform` now takes `feature_mask` parameter, which controls which features are used to compute `logcellcounts`. Defaults to only using "Gene Expression" features.
* Caching of `scparams` results now uses `StableHashTraits` for hashing.
8 changes: 6 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "SCTransform"
uuid = "f56ea72b-d1ea-403e-9c0c-95d94e8a8532"
authors = ["Rasmus Henningsson <[email protected]>"]
version = "0.1.3"
version = "0.2.0"

[deps]
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
Expand All @@ -14,6 +14,7 @@ SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
Scratch = "6c6a2e73-6563-6170-7368-637461726353"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
StableHashTraits = "c5dd0088-6c3f-4803-b00e-f31a60c170fa"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

[compat]
Expand All @@ -28,13 +29,16 @@ SHA = "0.7"
Scratch = "1.1"
SparseArrays = "1"
SpecialFunctions = "0.10, 1, 2"
StableHashTraits = "1.2.0"
Statistics = "1"
julia = "1.8"

[extras]
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SingleCell10x = "eddce310-d14b-4c2d-aa06-cfe9e2f7af98"
StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test", "SingleCell10x", "DataFrames"]
test = ["Test", "SingleCell10x", "DataFrames", "Random", "StableRNGs"]
1 change: 1 addition & 0 deletions src/SCTransform.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ using Scratch
using SHA
using DelimitedFiles
using CodecZlib
using StableHashTraits

include("utils.jl")
include("table.jl")
Expand Down
54 changes: 4 additions & 50 deletions src/cache.jl
Original file line number Diff line number Diff line change
@@ -1,57 +1,11 @@
# Here is a 64 bit hash implementation that
# * Is stable across versions (unless we choose to change it ourselves)
# * Takes all elements of a vector into account
# * Is reasonable fast (otherwise we would've used sha256)

function _hash(n::UInt64) # copied from Julia Base hash_64_64 (MIT license)
a::UInt64 = n
a = ~a + a << 21
a = a a >> 24
a = a + a << 3 + a << 8
a = a a >> 14
a = a + a << 2 + a << 4
a = a a >> 28
a = a + a << 31
return a
end
_hash(n::UInt32) = _hash(UInt64(n))
_hash(n::Union{Bool,UInt32}) = _hash(UInt64(n))

_hash(n,h::UInt64) = _hash(n) - 3h

function _fullhash_impl(v, h=UInt(0))
for x in v
h = _hash(x,h)
end
h
end

_fullhash(v::AbstractVector{Float64}, h=UInt64(0)) = _fullhash_impl(reinterpret(UInt64,v), h)
_fullhash(v::AbstractVector{Int64}, h=UInt64(0)) = _fullhash_impl(reinterpret(UInt64,v), h)
_fullhash(v::AbstractVector{Float32}, h=UInt64(0)) = _fullhash_impl(reinterpret(UInt32,v), h)
_fullhash(v::AbstractVector{Int32}, h=UInt64(0)) = _fullhash_impl(reinterpret(UInt32,v), h)
_fullhash(v, h=UInt64(0)) = _fullhash_impl(v, h)



const SCPARAMS_VERSION = v"0.1.0"
const SCPARAMS_VERSION = v"0.2.0"

function _scparams_checksum(X::SparseMatrixCSC,method,min_cells,feature_mask)
function _scparams_checksum(X::SparseMatrixCSC, method::Symbol, min_cells::Int, feature_mask::BitVector)
@assert method in (:poisson, :nb) "Method must be :poisson or :nb"
P,N = size(X)

h = String[]
push!(h, string(P))
push!(h, string(N))
push!(h, string(_fullhash(X.colptr)))
push!(h, string(_fullhash(X.rowval)))
push!(h, string(_fullhash(X.nzval)))
push!(h, string(method))
push!(h, string(min_cells))
push!(h, string(_fullhash(feature_mask)))
push!(h, string(SCPARAMS_VERSION))

h = bytes2hex(sha256(join(h,"__<sep>__")))
tup = (P, N, X.colptr, X.rowval, X.nzval, method, min_cells, feature_mask, SCPARAMS_VERSION)
bytes2hex(stable_hash(tup; version=3))
end

function _scparams_cache_convert_column(data, type)
Expand Down
29 changes: 26 additions & 3 deletions src/params.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,29 @@
# logcellcounts(X::SparseMatrixCSC) = log10.(max.(1,vec(sum(X;dims=2))))

# Assumes each column is a cell
logcellcounts(X::SparseMatrixCSC) = log10.(max.(1,vec(sum(X;dims=1))))
# logcellcounts(X::SparseMatrixCSC) = log10.(max.(1,vec(sum(X;dims=1))))


# Assumes each column is a cell
function logcellcounts(X::SparseMatrixCSC, feature_mask)
P,N = size(X)
@assert length(feature_mask)==P
out = zeros(N)

# TODO: Can we make this a bit faster? If it wasn't for the feature_mask, it would just be sum(X;dims=1)
R = rowvals(X)
V = nonzeros(X)
for j=1:N
s = sum(nzrange(X,j); init=0) do k
V[k]*feature_mask[R[k]]
end

out[j] = log10(max(1,s))
end

out
end


# Assumes each column is a gene
# function loggenemean(X::SparseMatrixCSC)
Expand Down Expand Up @@ -101,7 +123,7 @@ function scparams_estimate(::Type{T}, X::AbstractSparseMatrix{Tv,Ti};
nthreads = max(nthreads,1)
P,N = size(X)

logCellCounts=logcellcounts(X)
logCellCounts=logcellcounts(X, feature_mask)
logGeneMean=loggenemean(X)

@assert method in (:poisson, :nb) "Method must be :poisson or :nb"
Expand Down Expand Up @@ -356,7 +378,7 @@ See also: [`sctransform`](@ref)
"""
function scparams(::Type{T}, X::AbstractSparseMatrix, features;
method=:poisson,
min_cells::Integer=5,
min_cells::Int=5,
feature_type = hasproperty(features, :feature_type) ? "Gene Expression" : nothing,
feature_mask = feature_type !== nothing ? features.feature_type.==feature_type : trues(size(X,1)),
feature_names = hasproperty(features,:name) ? features.name : features.id,
Expand All @@ -367,6 +389,7 @@ function scparams(::Type{T}, X::AbstractSparseMatrix, features;
kwargs...) where T
P,N = size(X)
length(feature_names) == P || throw(DimensionMismatch("The number of rows in the count matrix and the number of features do not match."))
feature_mask = convert(BitVector, feature_mask)

if cache_read || cache_write
h = _scparams_checksum(X,method,min_cells,feature_mask)
Expand Down
19 changes: 13 additions & 6 deletions src/transform.jl
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
"""
sctransform(X::AbstractSparseMatrix, features, params;
transpose = false,
feature_id_columns = [:id,:feature_type],
cell_ind = 1:size(X,2),
clip=sqrt(size(X,2)/30))
transpose = false,
feature_id_columns = [:id,:feature_type],
feature_type,
feature_mask,
cell_ind = 1:size(X,2),
clip=sqrt(size(X,2)/30))
Computes the SCTransform of the sparse matrix `X`, with features as rows and cells as columns.
`features` should be a table (e.g. DataFrame or NamedTuple) with feature annotations.
`params` is a table with reguralized feature parameter estimates - typically estimated using the function `scparams`.
* `transpose` - set to true to transpose the output (cells as rows, features as columns).
* `feature_id_columns` is a vector of column names in `features`. The rows in `features` must be unique based on these columns.
* `feature_type` - Convenience parameter used for `feature_mask` default. Defaults to "Gene Expression" if `features` has a `feature_type` column.
* `feature_mask` - Vector of booleans deciding which features to use. If set explicitly, the `feature_type` parameter is ignored, otherwise `feature_mask` defaults to only choosing the `feature_type` specified. Affects how logcellcounts are computed. Normally expected to match `feature_mask` used when calling `scparams`.
* `cell_ind` is vector of cell indices to include in the output. This is computationally more efficient than subsetting afterwards, but yields the same result.
* `clip` - values less than `-clip` or larger than `clip` are clamped, to reduce the impact of outliers.
Expand All @@ -19,11 +23,15 @@ See also: [`scparams`](@ref)
function sctransform(X::AbstractSparseMatrix, features, params;
transpose = false,
feature_id_columns = [:id,:feature_type],
feature_type = hasproperty(features, :feature_type) ? "Gene Expression" : nothing,
feature_mask = feature_type !== nothing ? features.feature_type.==feature_type : trues(size(X,1)),
cell_ind = 1:size(X,2),
clip=sqrt(size(X,2)/30))

@assert size(X,1)==length(getproperty(features,first(propertynames(features)))) "The number of rows in X and features must match"

feature_mask = convert(BitVector, feature_mask)

β0 = params.beta0
β1 = params.beta1
θ = params.theta
Expand All @@ -40,8 +48,7 @@ function sctransform(X::AbstractSparseMatrix, features, params;
any(isnothing, feature_ind) && throw(DomainError("Feature ids in `params` does not match ids in `features`."))



logCellCounts = logcellcounts(X)[cell_ind]
logCellCounts = logcellcounts(X, feature_mask)[cell_ind]

# TODO: Do not create intermediate X[feature_ind,cell_ind]
X = X[feature_ind,cell_ind]
Expand Down
63 changes: 56 additions & 7 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@ using Test
using SCTransform
using SCTransform: scparams_estimate, scparams_detect_outliers, scparams_bandwidth, scparams_regularize
using SingleCell10x
using SparseArrays
using Random
using StableRNGs

using DelimitedFiles
using DataFrames
Expand All @@ -21,20 +24,39 @@ end
maxnorm(x) = maximum(abs, x)


simple_logcellcounts(X::SparseMatrixCSC) = log10.(max.(1,vec(sum(X;dims=1))))



@testset "SCTransform.jl" begin
@testset "logcellcounts" begin
X = sprand(StableRNG(7833), 50, 40, 0.1, (rng,k)->rand(rng,1:1000,k))
@test SCTransform.logcellcounts(X,trues(50)) == simple_logcellcounts(X)

fm = bitrand(StableRNG(9864), 50)
@test SCTransform.logcellcounts(X,fm) == simple_logcellcounts(X[fm,:])

X .*= bitrand(StableRNG(5493), 1, 40)
dropzeros!(X)
@test SCTransform.logcellcounts(X,fm) == simple_logcellcounts(X[fm,:])
end


@testset "500_PBMC_50genes" begin
filename = "data/500_PBMC_50genes/filtered_feature_bc_matrix.h5"
data_path = joinpath(pkgdir(SCTransform), "test/data")


filename = joinpath(data_path, "500_PBMC_50genes/filtered_feature_bc_matrix.h5")
X,f,b = read10x(filename)

pnonreg_ans = read_model_pars("data/500_PBMC_50genes/model_pars.csv")
gm_ans = readdlm("data/500_PBMC_50genes/genemean.csv", Float64)
pnonreg_ans = read_model_pars(joinpath(data_path, "500_PBMC_50genes/model_pars.csv"))
gm_ans = readdlm(joinpath(data_path, "500_PBMC_50genes/genemean.csv"), Float64)

pnonreg_ans2 = read_model_pars("data/500_PBMC_50genes/model_pars2.csv")
pnonreg_ans2 = read_model_pars(joinpath(data_path, "500_PBMC_50genes/model_pars2.csv"))
bw_ans2 = 0.243867597143564
preg_ans2 = read_model_pars("data/500_PBMC_50genes/model_pars_fit2.csv")
gm_ans2 = readdlm("data/500_PBMC_50genes/genemean2.csv", Float64)
Z_ans2 = readdlm("data/500_PBMC_50genes/transformed2_every_10th_cell.csv", ',', Float64)
preg_ans2 = read_model_pars(joinpath(data_path, "500_PBMC_50genes/model_pars_fit2.csv"))
gm_ans2 = readdlm(joinpath(data_path, "500_PBMC_50genes/genemean2.csv"), Float64)
Z_ans2 = readdlm(joinpath(data_path, "500_PBMC_50genes/transformed2_every_10th_cell.csv"), ',', Float64)



Expand Down Expand Up @@ -148,6 +170,33 @@ maxnorm(x) = maximum(abs, x)
@test_throws DomainError sctransform(X2, f2, preg3)
end


@testset "feature_mask" begin
f_ind = setdiff(1:50, [2,18,33,35,50]) # remove some variables that are excluded by scparams - i.e. they will only affect logcellcounts!

# Ground truth by actually removing the variables
X3 = X[f_ind,:]
f3 = SCTransform.subset_rows(f, f_ind) # TODO: avoid using internal function
preg3 = scparams(X3, f3; use_cache=false)
Z3 = sctransform(X3, f3, preg3)

# Test by only excluding the varaibles using feature_mask
feature_mask = falses(50)
feature_mask[f_ind] .= true
preg_fm = scparams(X, f; feature_mask, use_cache=false)
Z_fm = sctransform(X, f, preg_fm; feature_mask)

@test preg_fm == preg3
@test Z_fm Z3


# Sanity check that it's different from including all genes - i.e. using the feature_mask and thus changing logcellcount makes a difference
preg = scparams(X, f; use_cache=false)
@test preg != preg_fm
Z = sctransform(X, f, preg)
@test !(Z_fm Z)
end

end
end

2 comments on commit 14cd387

@rasmushenningsson
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator register

Release notes:

Breaking change. scparams and sctransform now uses feature_mask when computing logcellcounts. This makes handling of datasets with multiple modalities (e.g. RNA and antibody counts) easier.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/109235

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.2.0 -m "<description of version>" 14cd387868986375c29e9fdf3941cac340de047b
git push origin v0.2.0

Please sign in to comment.