Skip to content

Commit

Permalink
ITS GPU: Make threads and blocks configurable from CLI (#13596)
Browse files Browse the repository at this point in the history
  • Loading branch information
mconcas authored Oct 15, 2024
1 parent 415a7b5 commit 34eb6f4
Show file tree
Hide file tree
Showing 11 changed files with 53 additions and 64 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,6 @@ class TimeFrameGPU : public TimeFrame
void registerHostMemory(const int);
void unregisterHostMemory(const int);
void initialise(const int, const TrackingParameters&, const int, IndexTableUtils* utils = nullptr, const TimeFrameGPUParameters* pars = nullptr);
void initialiseHybrid(const int, const TrackingParameters&, const int, IndexTableUtils* utils = nullptr, const TimeFrameGPUParameters* pars = nullptr);
void initDevice(IndexTableUtils*, const TrackingParameters& trkParam, const TimeFrameGPUParameters&, const int, const int);
void initDeviceSAFitting();
void loadTrackingFrameInfoDevice(const int);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ void trackSeedHandler(CellSeed* trackSeeds,
float maxChi2ClusterAttachment,
float maxChi2NDF,
const o2::base::Propagator* propagator,
const o2::base::PropagatorF::MatCorrType matCorrType);
const o2::base::PropagatorF::MatCorrType matCorrType,
const int nBlocks,
const int nThreads);
} // namespace o2::its
#endif // ITSTRACKINGGPU_TRACKINGKERNELS_H_
21 changes: 0 additions & 21 deletions Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
Original file line number Diff line number Diff line change
Expand Up @@ -362,27 +362,6 @@ void TimeFrameGPU<nLayers>::initialise(const int iteration,
const int maxLayers,
IndexTableUtils* utils,
const TimeFrameGPUParameters* gpuParam)
{
mGpuStreams.resize(mGpuParams.nTimeFrameChunks);
mHostNTracklets.resize((nLayers - 1) * mGpuParams.nTimeFrameChunks, 0);
mHostNCells.resize((nLayers - 2) * mGpuParams.nTimeFrameChunks, 0);

auto init = [&]() -> void {
this->initDevice(utils, trkParam, *gpuParam, maxLayers, iteration);
};
std::thread t1{init};
RANGE("tf_cpu_initialisation", 1);
o2::its::TimeFrame::initialise(iteration, trkParam, maxLayers);
// registerHostMemory(maxLayers);
t1.join();
}

template <int nLayers>
void TimeFrameGPU<nLayers>::initialiseHybrid(const int iteration,
const TrackingParameters& trkParam,
const int maxLayers,
IndexTableUtils* utils,
const TimeFrameGPUParameters* gpuParam)
{
mGpuStreams.resize(mGpuParams.nTimeFrameChunks);
o2::its::TimeFrame::initialise(iteration, trkParam, maxLayers);
Expand Down
9 changes: 6 additions & 3 deletions Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

#include "ITStrackingGPU/TrackerTraitsGPU.h"
#include "ITStrackingGPU/TrackingKernels.h"
#include "ITStracking/TrackingConfigParam.h"

namespace o2::its
{
Expand All @@ -28,7 +29,7 @@ constexpr int UnusedIndex{-1};
template <int nLayers>
void TrackerTraitsGPU<nLayers>::initialiseTimeFrame(const int iteration)
{
mTimeFrameGPU->initialiseHybrid(iteration, mTrkParams[iteration], nLayers);
mTimeFrameGPU->initialise(iteration, mTrkParams[iteration], nLayers);
mTimeFrameGPU->loadTrackingFrameInfoDevice(iteration);
}

Expand Down Expand Up @@ -397,7 +398,7 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
}
mTimeFrameGPU->createTrackITSExtDevice(trackSeeds);
mTimeFrameGPU->loadTrackSeedsDevice(trackSeeds);

auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
trackSeedHandler(
mTimeFrameGPU->getDeviceTrackSeeds(), // CellSeed* trackSeeds,
mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), // TrackingFrameInfo** foundTrackingFrameInfo,
Expand All @@ -408,7 +409,9 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
mTrkParams[0].MaxChi2ClusterAttachment, // float maxChi2ClusterAttachment,
mTrkParams[0].MaxChi2NDF, // float maxChi2NDF,
mTimeFrameGPU->getDevicePropagator(), // const o2::base::Propagator* propagator
mCorrType); // o2::base::PropagatorImpl<float>::MatCorrType
mCorrType, // o2::base::PropagatorImpl<float>::MatCorrType
conf.nBlocks,
conf.nThreads);

mTimeFrameGPU->downloadTrackITSExtDevice(trackSeeds);

Expand Down
8 changes: 5 additions & 3 deletions Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -717,9 +717,11 @@ void trackSeedHandler(CellSeed* trackSeeds,
float maxChi2ClusterAttachment,
float maxChi2NDF,
const o2::base::Propagator* propagator,
const o2::base::PropagatorF::MatCorrType matCorrType)
const o2::base::PropagatorF::MatCorrType matCorrType,
const int nBlocks,
const int nThreads)
{
gpu::fitTrackSeedsKernel<<<20, 256>>>(
gpu::fitTrackSeedsKernel<<<nBlocks, nThreads>>>(
trackSeeds, // CellSeed* trackSeeds,
foundTrackingFrameInfo, // TrackingFrameInfo** foundTrackingFrameInfo,
tracks, // o2::its::TrackITSExt* tracks,
Expand All @@ -734,4 +736,4 @@ void trackSeedHandler(CellSeed* trackSeeds,
gpuCheckError(cudaPeekAtLastError());
gpuCheckError(cudaDeviceSynchronize());
}
} // namespace o2::its
} // namespace o2::its
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ void VertexerTraitsGPU::initialise(const TrackingParameters& trackingParams, con
{
mTimeFrameGPU->initialise(0, trackingParams, 3, &mIndexTableUtils, &mTfGPUParams);
}

void VertexerTraitsGPU::updateVertexingParameters(const std::vector<VertexingParameters>& vrtPar, const TimeFrameGPUParameters& tfPar)
{
mVrtParams = vrtPar;
Expand Down
41 changes: 28 additions & 13 deletions Detectors/ITSMFT/ITS/tracking/GPU/cuda/VertexingKernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,23 @@ void trackletFinderHandler(const Cluster* clustersNextLayer, // 0 2
const unsigned int startRofId,
const unsigned int rofSize,
const float phiCut,
const size_t maxTrackletsPerCluster)
const unsigned int maxTrackletsPerCluster,
const int nBlocks,
const int nThreads)
{
gpu::trackleterKernelMultipleRof<Mode><<<nBlocks, nThreads>>>(
clustersNextLayer, // const Cluster* clustersNextLayer, // 0 2
clustersCurrentLayer, // const Cluster* clustersCurrentLayer, // 1 1
sizeNextLClusters, // const int* sizeNextLClusters,
sizeCurrentLClusters, // const int* sizeCurrentLClusters,
nextIndexTables, // const int* nextIndexTables,
Tracklets, // Tracklet* Tracklets,
foundTracklets, // int* foundTracklets,
utils, // const IndexTableUtils* utils,
startRofId, // const unsigned int startRofId,
rofSize, // const unsigned int rofSize,
phiCut, // const float phiCut,
maxTrackletsPerCluster); // const unsigned int maxTrackletsPerCluster = 1e2
}
/*
GPUd() float smallestAngleDifference(float a, float b)
Expand Down Expand Up @@ -96,7 +111,7 @@ GPUd() void printOnBlock(const unsigned int bId, const char* str, Args... args)
}
}
GPUg() void printBufferOnThread(const int* v, size_t size, const int len = 150, const unsigned int tId = 0)
GPUg() void printBufferOnThread(const int* v, unsigned int size, const int len = 150, const unsigned int tId = 0)
{
if (blockIdx.x * blockDim.x + threadIdx.x == tId) {
for (int i{0}; i < size; ++i) {
Expand All @@ -109,7 +124,7 @@ GPUg() void printBufferOnThread(const int* v, size_t size, const int len = 150,
}
}
GPUg() void printBufferOnThreadF(const float* v, size_t size, const unsigned int tId = 0)
GPUg() void printBufferOnThreadF(const float* v, unsigned int size, const unsigned int tId = 0)
{
if (blockIdx.x * blockDim.x + threadIdx.x == tId) {
printf("vector :");
Expand All @@ -127,7 +142,7 @@ GPUg() void resetTrackletsKernel(Tracklet* tracklets, const int nTracklets)
}
}
GPUg() void dumpFoundTrackletsKernel(const Tracklet* tracklets, const int* nTracklet, const size_t nClustersMiddleLayer, const int maxTrackletsPerCluster)
GPUg() void dumpFoundTrackletsKernel(const Tracklet* tracklets, const int* nTracklet, const unsigned int nClustersMiddleLayer, const int maxTrackletsPerCluster)
{
for (int iCurrentLayerClusterIndex = blockIdx.x * blockDim.x + threadIdx.x; iCurrentLayerClusterIndex < nClustersMiddleLayer; iCurrentLayerClusterIndex += blockDim.x * gridDim.x) {
const int stride{iCurrentLayerClusterIndex * maxTrackletsPerCluster};
Expand Down Expand Up @@ -160,15 +175,15 @@ GPUg() void trackleterKernelSingleRof(
int* foundTracklets,
const IndexTableUtils* utils,
const short rofId,
const size_t maxTrackletsPerCluster = 1e2)
const unsigned int maxTrackletsPerCluster = 1e2)
{
const int phiBins{utils->getNphiBins()};
const int zBins{utils->getNzBins()};
// loop on layer1 clusters
for (int iCurrentLayerClusterIndex = blockIdx.x * blockDim.x + threadIdx.x; iCurrentLayerClusterIndex < sizeCurrentLClusters; iCurrentLayerClusterIndex += blockDim.x * gridDim.x) {
if (iCurrentLayerClusterIndex < sizeCurrentLClusters) {
unsigned int storedTracklets{0};
const size_t stride{iCurrentLayerClusterIndex * maxTrackletsPerCluster};
const unsigned int stride{iCurrentLayerClusterIndex * maxTrackletsPerCluster};
const Cluster& currentCluster = clustersCurrentLayer[iCurrentLayerClusterIndex];
const int4 selectedBinsRect{VertexerTraits::getBinsRect(currentCluster, (int)Mode, 0.f, 50.f, phiCut / 2, *utils)};
if (selectedBinsRect.x != 0 || selectedBinsRect.y != 0 || selectedBinsRect.z != 0 || selectedBinsRect.w != 0) {
Expand Down Expand Up @@ -218,7 +233,7 @@ GPUg() void trackleterKernelMultipleRof(
const short startRofId,
const short rofSize,
const float phiCut,
const size_t maxTrackletsPerCluster = 1e2)
const unsigned int maxTrackletsPerCluster = 1e2)
{
const int phiBins{utils->getNphiBins()};
const int zBins{utils->getNzBins()};
Expand All @@ -235,7 +250,7 @@ GPUg() void trackleterKernelMultipleRof(
// single rof loop on layer1 clusters
for (int iCurrentLayerClusterIndex = threadIdx.x; iCurrentLayerClusterIndex < nClustersCurrentLayerRof; iCurrentLayerClusterIndex += blockDim.x) {
unsigned int storedTracklets{0};
const size_t stride{iCurrentLayerClusterIndex * maxTrackletsPerCluster};
const unsigned int stride{iCurrentLayerClusterIndex * maxTrackletsPerCluster};
const Cluster& currentCluster = clustersCurrentLayerRof[iCurrentLayerClusterIndex];
const int4 selectedBinsRect{VertexerTraits::getBinsRect(currentCluster, (int)Mode, 0.f, 50.f, phiCut / 2, *utils)};
if (selectedBinsRect.x != 0 || selectedBinsRect.y != 0 || selectedBinsRect.z != 0 || selectedBinsRect.w != 0) {
Expand Down Expand Up @@ -276,7 +291,7 @@ template <bool initRun>
GPUg() void trackletSelectionKernelSingleRof(
const Cluster* clusters0,
const Cluster* clusters1,
const size_t nClustersMiddleLayer,
const unsigned int nClustersMiddleLayer,
Tracklet* tracklets01,
Tracklet* tracklets12,
const int* nFoundTracklet01,
Expand Down Expand Up @@ -436,7 +451,7 @@ GPUg() void computeCentroidsKernel(
Line* lines,
int* nFoundLines,
int* nExclusiveFoundLines,
const size_t nClustersMiddleLayer,
const unsigned int nClustersMiddleLayer,
float* centroids,
const float lowHistX,
const float highHistX,
Expand All @@ -446,7 +461,7 @@ GPUg() void computeCentroidsKernel(
{
const int nLines = nExclusiveFoundLines[nClustersMiddleLayer - 1] + nFoundLines[nClustersMiddleLayer - 1];
const int maxIterations{nLines * (nLines - 1) / 2};
for (size_t currentThreadIndex = blockIdx.x * blockDim.x + threadIdx.x; currentThreadIndex < maxIterations; currentThreadIndex += blockDim.x * gridDim.x) {
for (unsigned int currentThreadIndex = blockIdx.x * blockDim.x + threadIdx.x; currentThreadIndex < maxIterations; currentThreadIndex += blockDim.x * gridDim.x) {
int iFirstLine = currentThreadIndex / nLines;
int iSecondLine = currentThreadIndex % nLines;
// All unique pairs
Expand Down Expand Up @@ -496,7 +511,7 @@ GPUg() void computeZCentroidsKernel(
const int binOpeningX,
const int binOpeningY)
{
for (size_t currentThreadIndex = blockIdx.x * blockDim.x + threadIdx.x; currentThreadIndex < nLines; currentThreadIndex += blockDim.x * gridDim.x) {
for (unsigned int currentThreadIndex = blockIdx.x * blockDim.x + threadIdx.x; currentThreadIndex < nLines; currentThreadIndex += blockDim.x * gridDim.x) {
if (tmpVtX[0].value || tmpVtX[1].value) {
float tmpX{lowHistX + tmpVtX[0].key * binSizeHistX + binSizeHistX / 2};
int sumWX{tmpVtX[0].value};
Expand Down Expand Up @@ -543,7 +558,7 @@ GPUg() void computeVertexKernel(
const int minContributors,
const int binOpeningZ)
{
for (size_t currentThreadIndex = blockIdx.x * blockDim.x + threadIdx.x; currentThreadIndex < binOpeningZ; currentThreadIndex += blockDim.x * gridDim.x) {
for (unsigned int currentThreadIndex = blockIdx.x * blockDim.x + threadIdx.x; currentThreadIndex < binOpeningZ; currentThreadIndex += blockDim.x * gridDim.x) {
if (currentThreadIndex == 0) {
if (tmpVertexBins[2].value > 1 && (tmpVertexBins[0].value || tmpVertexBins[1].value)) {
float z{lowHistZ + tmpVertexBins[2].key * binSizeHistZ + binSizeHistZ / 2};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,6 @@ struct VertexerParamConfig : public o2::conf::ConfigurableParamHelper<VertexerPa
};

struct TrackerParamConfig : public o2::conf::ConfigurableParamHelper<TrackerParamConfig> {

// Use TGeo for mat. budget
bool useMatCorrTGeo = false;
bool useFastMaterial = false;
Expand Down Expand Up @@ -89,24 +88,13 @@ struct TrackerParamConfig : public o2::conf::ConfigurableParamHelper<TrackerPara
O2ParamDef(TrackerParamConfig, "ITSCATrackerParam");
};

struct GpuRecoParamConfig : public o2::conf::ConfigurableParamHelper<GpuRecoParamConfig> {
struct ITSGpuTrackingParamConfig : public o2::conf::ConfigurableParamHelper<ITSGpuTrackingParamConfig> {
// GPU-specific parameters
size_t tmpCUBBufferSize = 1e5; // In average in pp events there are required 4096 bytes
size_t maxTrackletsPerCluster = 1e2;
size_t clustersPerLayerCapacity = 2.5e5;
size_t clustersPerROfCapacity = 1.5e3;
// size_t trackletsCapacity = maxTrackletsPerCluster * clustersPerLayerCapacity;
size_t validatedTrackletsCapacity = 1e5;
size_t cellsLUTsize = validatedTrackletsCapacity;
size_t maxNeighboursSize = 1e4;
size_t neighboursLUTsize = maxNeighboursSize;
size_t maxRoadPerRofSize = 5e2; // pp!
size_t maxLinesCapacity = 1e2;
size_t maxVerticesCapacity = 5e4;
size_t nTimeFramePartitions = 3;
int maxGPUMemoryGB = -1;
unsigned int tmpCUBBufferSize = 1e5; // In average in pp events there are required 4096 bytes
int nBlocks = 20;
int nThreads = 256;

O2ParamDef(GpuRecoParamConfig, "ITSGpuRecoParam");
O2ParamDef(ITSGpuTrackingParamConfig, "ITSGpuTrackingParam");
};

} // namespace its
Expand Down
4 changes: 2 additions & 2 deletions Detectors/ITSMFT/ITS/tracking/src/TrackingConfigParam.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,10 @@ namespace its
{
static auto& sVertexerParamITS = o2::its::VertexerParamConfig::Instance();
static auto& sCATrackerParamITS = o2::its::TrackerParamConfig::Instance();
static auto& sGpuRecoParamITS = o2::its::GpuRecoParamConfig::Instance();
static auto& sGpuRecoParamITS = o2::its::ITSGpuTrackingParamConfig::Instance();

O2ParamImpl(o2::its::VertexerParamConfig);
O2ParamImpl(o2::its::TrackerParamConfig);
O2ParamImpl(o2::its::GpuRecoParamConfig);
O2ParamImpl(o2::its::ITSGpuTrackingParamConfig);
} // namespace its
} // namespace o2
4 changes: 2 additions & 2 deletions Detectors/ITSMFT/ITS/tracking/src/TrackingLinkDef.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
#pragma link C++ class o2::its::TrackerParamConfig + ;
#pragma link C++ class o2::conf::ConfigurableParamHelper < o2::its::TrackerParamConfig> + ;

#pragma link C++ class o2::its::GpuRecoParamConfig + ;
#pragma link C++ class o2::conf::ConfigurableParamHelper < o2::its::GpuRecoParamConfig> + ;
#pragma link C++ class o2::its::ITSGpuTrackingParamConfig + ;
#pragma link C++ class o2::conf::ConfigurableParamHelper < o2::its::ITSGpuTrackingParamConfig> + ;

#endif
2 changes: 1 addition & 1 deletion Detectors/ITSMFT/ITS/tracking/src/Vertexer.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ void Vertexer::getGlobalConfiguration()
{
auto& vc = o2::its::VertexerParamConfig::Instance();
vc.printKeyValues(true, true);
auto& grc = o2::its::GpuRecoParamConfig::Instance();
auto& grc = o2::its::ITSGpuTrackingParamConfig::Instance();

// This is odd: we override only the parameters for the first iteration.
// Variations for the next iterations are set in the trackingInterfrace.
Expand Down

0 comments on commit 34eb6f4

Please sign in to comment.