-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ccall and bittensor performance #7
Comments
Flux uses the dot operation to implement the Dense output operation. A benchmark has also shown that BitDense, compared to Flux Dense, only achieves a 10.7x speedup in a scenario where it was expected to reach about a 32x speedup. Because of that, I noticed via the system monitor that Julia uses multiple cores for the Dense operation even when JULIA_NUM_THREADS is set to 1, so this performance gap may be due to multicore optimization. This thread focuses on finding a better-performing alternative, since BitDense has no other optimizations except the gcc -O3 flag. |
First try, Julia-side multicore ccall. It splits the work by dividing the number of neurons by the number of threads and then making a different ccall for each of the parts. Benchmark:
function bitDenseBitExecute(l::BitDense, x::BitArray{1})
    # Binary dense forward pass (try 1): split the weight-chunk rows into
    # one contiguous slice per thread and issue one ccall per slice.
    out = BitArray{1}(undef, size(l.W, 1))
    nthreads = Threads.nthreads()
    rows = size(l.W.chunks, 1)
    Threads.@threads for t in 1:nthreads
        # BUG FIX: partition by the loop index `t`, not Threads.threadid().
        # Julia tasks may migrate between threads, so threadid() is not
        # guaranteed to equal the iteration number — threadid()-based
        # slicing can process some rows twice and skip others. The loop
        # index covers 1:nthreads exactly once by construction.
        loopStart = round(Int, (t - 1) * rows / nthreads)
        loopEnd = round(Int, t * rows / nthreads)
        ccall((:binaryBitOutDotProduct, binarynetlib), Cvoid,
              (Ptr{UInt64}, Ptr{UInt64}, Ptr{UInt64}, Ptr{UInt64},
               Cint, Cint, Cint, Cint),
              x.chunks, l.W.chunks, l.b.chunks, out.chunks,
              loopStart, loopEnd, size(l.W, 1), size(l.W, 2))
    end
    return out
end
/*
 * Binary dot product for the neuron slice [neuronStart, neuronEnd).
 *
 * in:            input bit chunks (one uint64_t per 64 input bits)
 * weights:       weight chunks, advanced by neuronNumber per chunk row
 * bias:          bias bits, addressed with absolute bit indices
 * out:           output bits, addressed with absolute bit indices
 * neuronNumber:  total neuron count; wNumber: weights per neuron
 *
 * NOTE(review): when this is called from multiple threads and neuronStart
 * is not a multiple of 64, two threads can read-modify-write the same
 * uint64_t word of `out` via setBit — confirm partitions are 64-aligned
 * or make setBit atomic.
 */
void binaryBitOutDotProduct(uint64_t* in, uint64_t* weights, uint64_t* bias,
    uint64_t* out, int neuronStart, int neuronEnd, int neuronNumber, int wNumber)
{
    int wNumberChunks = ceil(wNumber / 64.0);
    if(wNumberChunks > 1)
    {
        int nLim = neuronEnd - neuronStart;
        int tempOut[nLim];
        int iLim = wNumberChunks - 1;
        weights += neuronStart;
        uint64_t firstIn = in[0];
        /* __builtin_popcountll: the ..l variant counts a `long`, which is
           32 bits on Windows/LLP64 and would silently drop half the word. */
        for(int i = 0; i < nLim; i++)
            tempOut[i] = __builtin_popcountll(firstIn ^ weights[i]);
        weights += neuronNumber;
        for(int i = 1; i < iLim; i++)
        {
            for(int j = 0; j < nLim; j++)
                tempOut[j] += __builtin_popcountll(in[i] ^ weights[j]);
            weights += neuronNumber;
        }
        uint64_t lastIn = in[iLim];
        for(int i = 0; i < nLim; i++)
        {
            /* BUG FIX: the original advanced `bias` by neuronStart
               uint64_t elements (= neuronStart*64 bits) and then read bit
               i, and wrote output bit i instead of neuronStart + i.
               getBit/setBit address individual bits, so use absolute bit
               indices, exactly as the single-chunk branch below does. */
            int b = getBit(bias, neuronStart + i);
            tempOut[i] += __builtin_popcountll(lastIn ^ weights[i]);
            setBit(out, neuronStart + i,
                   ((tempOut[i] + b) > ((wNumber + !b) - tempOut[i])));
        }
    }
    else
    {
        /* Single weight chunk per neuron: no accumulator array needed. */
        for(int i = neuronStart; i < neuronEnd; i++)
        {
            int b = getBit(bias, i);
            int temp = __builtin_popcountll(in[0] ^ weights[i]);
            setBit(out, i, ((temp + b) > ((wNumber + !b) - temp)));
        }
    }
}
All tests were performed by using Dense layers with 640 input values and 64 neurons
|
Second try, dot operations.
# Binary-neuron activation decision. `w` is the popcount of
# (input ⊻ weights), i.e. the number of disagreeing bit positions;
# `b` is the bias bit and `nWeights` the total number of weights.
# The neuron fires when the (bias-adjusted) disagreements outnumber
# the agreements.
@inline function _bitOperation(w::Int, b::Bool, nWeights::Int)
    agreements = nWeights - w
    return w + b > agreements + !b
end
function bitDenseBitExecute(l::BitDense, x::BitArray{1})
    # Broadcast variant (try 2): XOR every weight chunk against the
    # matching input chunk; the transpose puts chunks along dim 1 so the
    # column-wise sum yields one popcount total per neuron.
    diffs = transpose(l.W.chunks) .⊻ x.chunks
    perNeuron = vec(sum(count_ones.(diffs), dims=1))
    # Threshold each neuron's count against its bias bit.
    return BitArray(_bitOperation.(perNeuron, l.b, length(x)))
end
Regular BitDense benchmark:
|
Third try, openmp parallelization.
function bitDenseBitExecute(l::BitDense, x::BitArray{1})
    # OpenMP variant (try 3): a single ccall; the C side parallelizes
    # internally, so we only forward the Julia thread count.
    neurons = size(l.W, 1)
    out = BitArray{1}(undef, neurons)
    ccall((:binaryBitOutDotProduct, binarynetlib), Cvoid,
          (Ptr{UInt64}, Ptr{UInt64}, Ptr{UInt64}, Ptr{UInt64},
           Cint, Cint, Cint),
          x.chunks, l.W.chunks, l.b.chunks, out.chunks,
          neurons, size(l.W, 2), Threads.nthreads())
    return out
end
/*
 * Binary dot product over all neurons, parallelized over weight chunks
 * with OpenMP. Layout assumption (as in the original): weight chunks for
 * chunk row i start at weights + i*neuronNumber.
 */
void binaryBitOutDotProduct(uint64_t* in, uint64_t* weights, uint64_t* bias,
    uint64_t* out, int neuronNumber, int wNumber, int nthreads)
{
    omp_set_num_threads(nthreads);
    int wNumberChunks = ceil(wNumber / 64.0);
    if(wNumberChunks > 1)
    {
        int tempOut[neuronNumber];
        int iLim = wNumberChunks - 1;
        uint64_t firstIn = in[0];
        /* popcountll, not popcountl: `long` is 32-bit on LLP64 targets. */
        for(int i = 0; i < neuronNumber; i++)
            tempOut[i] = __builtin_popcountll(firstIn ^ weights[i]);
        /* BUG FIX: the original used firstprivate(tempOut), which hands
           each thread a private copy of the accumulator and then throws
           those copies away when the parallel region ends — the partial
           popcounts of all the middle chunks were lost whenever more than
           one thread ran. An array-section reduction (OpenMP 4.5+) sums
           every thread's partial counts back into tempOut. */
        #pragma omp parallel for reduction(+:tempOut[:neuronNumber])
        for(int i = 1; i < iLim; i++)
        {
            const uint64_t* w = weights + (size_t)i * neuronNumber;
            for(int j = 0; j < neuronNumber; j++)
                tempOut[j] += __builtin_popcountll(in[i] ^ w[j]);
        }
        /* Last chunk: fold in its popcount and emit the output bit. */
        const uint64_t* wLast = weights + (size_t)iLim * neuronNumber;
        uint64_t lastIn = in[iLim];
        for(int i = 0; i < neuronNumber; i++)
        {
            int b = getBit(bias, i);
            tempOut[i] += __builtin_popcountll(lastIn ^ wLast[i]);
            setBit(out, i, ((tempOut[i] + b) > ((wNumber + !b) - tempOut[i])));
        }
    }
    else
    {
        /* Single chunk per neuron: no accumulation needed. */
        for(int i = 0; i < neuronNumber; i++)
        {
            int b = getBit(bias, i);
            int temp = __builtin_popcountll(in[0] ^ weights[i]);
            setBit(out, i, ((temp + b) > ((wNumber + !b) - temp)));
        }
    }
}
Fourth try, for loop execution:
function bitDenseBitExecute(l::BitDense, x::BitArray{1})
    # Plain nested-loop variant (try 4): for each row of the chunk matrix,
    # accumulate the popcount of (weight chunk ⊻ input chunk), then
    # threshold against the bias bit.
    chunks = l.W.chunks
    bias = l.b
    xChunks = x.chunks
    nRows = size(chunks, 1)
    nCols = size(chunks, 2)
    total = length(x)
    out = BitArray{1}(undef, nRows)
    for row in 1:nRows
        acc = 0
        # NOTE(review): xChunks is indexed by the chunk matrix's column
        # here — confirm the column dimension really matches the input's
        # chunk count for this layout (try 5 iterates the transpose).
        for col in 1:nCols
            acc += count_ones(chunks[row, col] ⊻ xChunks[col])
        end
        @inbounds out[row] = (acc + bias[row]) > (!bias[row] + total - acc)
    end
    return out
end
Fifth try, Julia for loop with matrix transposing:
function bitDenseBitExecute(l::BitDense, x::BitArray{1})
    # Transposed-loop variant (try 5): the chunk matrix stores chunks along
    # dim 1, so the inner loop over a neuron's chunks is cache friendly.
    Wc, b, Xc = l.W.chunks, l.b, x.chunks
    neuronNumber = size(Wc, 2)
    weightsNumber = size(l.W, 2)
    wChunksNumber = cld(weightsNumber, 64)
    # BUG FIX: the output holds one bit per *neuron*; the original
    # allocated length(x) bits (the input width), which is wrong for any
    # non-square layer — out[i] below is indexed by neuron (e.g. the
    # 640-input/64-neuron benchmark would allocate 640 bits).
    out = BitArray{1}(undef, neuronNumber)
    for i in 1:neuronNumber
        temp = 0
        for j in 1:wChunksNumber
            temp += count_ones(Wc[j, i] ⊻ Xc[j])
        end
        @inbounds out[i] = (temp + b[i]) > (!b[i] + weightsNumber - temp)
    end
    return out
end
"""
    BitTensor{2}(value, rows, cols)

Construct a `rows`×`cols` binary matrix. `value` selects the fill:
`undef` gives random bits, `true` all ones, anything else all zeros.
"""
function BitTensor{2}(value, rows::Int, cols::Int)
    # The chunk matrix is stored transposed (chunk index first) so that
    # walking one logical row touches contiguous memory.
    chunkDims = (ceil(Int, cols / 64), rows)
    chunks = if value == undef
        rand(UInt64, chunkDims)
    elseif value == true
        fill(typemax(UInt64), chunkDims)
    else
        zeros(UInt64, chunkDims)
    end
    if value != false
        # Clear the spare high bits of each row's last chunk so popcounts
        # agree with the logical width; nothing to do when cols is a
        # multiple of 64 (rest == 64).
        rest = 64 - (cols % 64)
        if rest < 64
            for r in 1:rows
                chunks[end, r] = (chunks[end, r] << rest) >> rest
            end
        end
    end
    bmat = new(chunks, rows * cols)
    bmat.dims = (rows, cols)
    return bmat
end
"""
    getindex(mat::BitTensor{2}, i, j)

Read bit `j` of row `i`; bit `j` lives in chunk `cld(j, 64)` of the
transposed chunk matrix, at position `(j - 1) % 64`.
"""
@inline function Base.getindex(mat::BitTensor{2}, i::Int, j::Int)
    jMod = (j - 1) % 64
    # BUG FIX: the mask must be UInt64. The original `1 << jMod` is an
    # Int64; for jMod == 63 it is negative (typemin(Int64)), and promoting
    # a negative Int64 to UInt64 inside `&` throws an InexactError, so
    # reading the 64th bit of any chunk failed.
    mask = one(UInt64) << jMod
    return Bool((mat.chunks[cld(j, 64), i] & mask) >> jMod)
end
"""
    setindex!(mat::BitTensor{2}, value, i, j)

Write bit `j` of row `i`: clear the target bit, then OR in `value`.
"""
@inline function Base.setindex!(mat::BitTensor{2}, value::Bool, i::Int, j::Int)
    jMod = (j - 1) % 64
    chunkIdx = cld(j, 64)
    # BUG FIX: masks must be UInt64. In the original, `~(1 << jMod)` is a
    # negative Int64 for every jMod < 63, and `true << 63` is negative
    # too; promoting a negative Int64 to UInt64 inside &/| throws an
    # InexactError, so almost every bit write failed.
    mat.chunks[chunkIdx, i] &= ~(one(UInt64) << jMod)
    mat.chunks[chunkIdx, i] |= UInt64(value) << jMod
end
Performance analysis. Try these methods and report which one performed best.
If ccall turns out to be the best performer, find a multiplatform solution for compiling it
The text was updated successfully, but these errors were encountered: