From 9c24fe74eb2349e2f0871b8ce429ef1d6adcc15d Mon Sep 17 00:00:00 2001 From: Kamil Skwarczynski Date: Sat, 8 Feb 2025 11:20:44 +0000 Subject: [PATCH] Minor speedup due to reusing calcuaiton isntead of repeating them --- covariance/covarianceBase.cpp | 4 ++-- splines/SplineMonolith.cpp | 30 +++++++++++++-------------- splines/gpuSplineUtils.cu | 39 ++++++++++------------------------- 3 files changed, 28 insertions(+), 45 deletions(-) diff --git a/covariance/covarianceBase.cpp b/covariance/covarianceBase.cpp index 922c20175..ccac2b5eb 100644 --- a/covariance/covarianceBase.cpp +++ b/covariance/covarianceBase.cpp @@ -667,8 +667,8 @@ double covarianceBase::CalcLikelihood() const _noexcept_ { for (int j = 0; j <= i; ++j) { if (!_fFlatPrior[i] && !_fFlatPrior[j]) { //KS: Since matrix is symmetric we can calculate non diagonal elements only once and multiply by 2, can bring up to factor speed decrease. - short int scale = (i != j) ? 2 : 1; - logL += scale * 0.5*(_fPropVal[i] - _fPreFitValue[i])*(_fPropVal[j] - _fPreFitValue[j])*InvertCovMatrix[i][j]; + double scale = (i != j) ? 1. : 0.5; + logL += scale * (_fPropVal[i] - _fPreFitValue[i])*(_fPropVal[j] - _fPreFitValue[j])*InvertCovMatrix[i][j]; } } } diff --git a/splines/SplineMonolith.cpp b/splines/SplineMonolith.cpp index 4e4a0074f..e41e813ee 100644 --- a/splines/SplineMonolith.cpp +++ b/splines/SplineMonolith.cpp @@ -525,7 +525,7 @@ void SMonolith::LoadSplineFile(std::string FileName) { FileName.insert(0, std::string(std::getenv("MACH3"))+"/"); } - TFile *SplineFile = new TFile(FileName.c_str(), "OPEN"); + auto SplineFile = std::make_unique(FileName.c_str(), "OPEN"); TTree *Settings = SplineFile->Get("Settings"); TTree *Monolith = SplineFile->Get("Monolith"); TTree *Monolith_TF1 = SplineFile->Get("Monolith_TF1"); @@ -655,7 +655,6 @@ void SMonolith::LoadSplineFile(std::string FileName) { } SplineFile->Close(); - delete SplineFile; // Print some info; could probably make this to a separate function PrintInitialsiation(); @@ -672,7 +671,7 @@ void SMonolith::PrepareSplineFile() { FileName.insert(0, std::string(std::getenv("MACH3"))+"/"); } - TFile *SplineFile = new TFile(FileName.c_str(), "recreate"); + auto SplineFile = std::make_unique(FileName.c_str(), "recreate"); TTree *Settings = new TTree("Settings", "Settings"); TTree *Monolith = new TTree("Monolith", "Monolith"); TTree *Monolith_TF1 = new TTree("Monolith_TF1", "Monolith_TF1"); @@ -791,7 +790,6 @@ void SMonolith::PrepareSplineFile() { delete EventInfo; delete FastSplineInfoTree; SplineFile->Close(); - delete SplineFile; } // ***************************************** @@ -929,7 +927,7 @@ void SMonolith::FindSplineSegment() { for (M3::int_t i = 0; i < nParams; ++i) { const M3::int_t nPoints = SplineInfoArray[i].nPts; - const M3::float_t* xArray = SplineInfoArray[i].xPts; + const M3::float_t* _restrict_ xArray = SplineInfoArray[i].xPts; // Get the variation for this reconfigure for the ith parameter const float xvar = float(*SplineInfoArray[i].splineParsPointer); @@ -1029,9 +1027,9 @@ void SMonolith::CalcSplineWeights() { // We've read the segment straight from CPU and is saved in segment_gpu // polynomial parameters from the monolithic splineMonolith const float fY = cpu_spline_handler->coeff_many[CurrentKnotPos]; - const float fB = cpu_spline_handler->coeff_many[CurrentKnotPos+1]; - const float fC = cpu_spline_handler->coeff_many[CurrentKnotPos+2]; - const float fD = cpu_spline_handler->coeff_many[CurrentKnotPos+3]; + const float fB = cpu_spline_handler->coeff_many[CurrentKnotPos + 1]; + const float fC = cpu_spline_handler->coeff_many[CurrentKnotPos + 2]; + const float fD = cpu_spline_handler->coeff_many[CurrentKnotPos + 3]; // The is the variation itself (needed to evaluate variation - stored spline point = dx) const float dx = ParamValues[Param] - cpu_spline_handler->coeff_x[segment_X]; @@ -1050,12 +1048,12 @@ void SMonolith::CalcSplineWeights() { const float x = ParamValues[cpu_paramNo_TF1_arr[tf1Num]]; // Read the coefficients - const float a = cpu_coeff_TF1_many[tf1Num*_nTF1Coeff_]; - const float b = cpu_coeff_TF1_many[tf1Num*_nTF1Coeff_+1]; + const unsigned int TF1_Index = tf1Num * _nTF1Coeff_; + const float a = cpu_coeff_TF1_many[TF1_Index]; + const float b = cpu_coeff_TF1_many[TF1_Index + 1]; cpu_weights_tf1_var[tf1Num] = fmaf(a, x, b); // cpu_weights_tf1_var[tf1Num] = a*x + b; - //cpu_weights_tf1_var[splineNum] = 1 + a*x + b*x*x + c*x*x*x + d*x*x*x*x + e*x*x*x*x*x; } #ifdef MULTITHREAD @@ -1076,9 +1074,11 @@ void SMonolith::ModifyWeights(){ { float totalWeight = 1.0f; // Initialize total weight for each event + const unsigned int Offset = 2 * EventNum; + // Extract the parameters for the current event - const unsigned int startIndex = cpu_nParamPerEvent[2 * EventNum + 1]; - const unsigned int numParams = cpu_nParamPerEvent[2 * EventNum]; + const unsigned int startIndex = cpu_nParamPerEvent[Offset + 1]; + const unsigned int numParams = cpu_nParamPerEvent[Offset]; // Compute total weight for the current event #ifdef MULTITHREAD @@ -1089,8 +1089,8 @@ void SMonolith::ModifyWeights(){ } //Now TF1 // Extract the parameters for the current event - const unsigned int startIndex_tf1 = cpu_nParamPerEvent_tf1[2 * EventNum + 1]; - const unsigned int numParams_tf1 = cpu_nParamPerEvent_tf1[2 * EventNum]; + const unsigned int startIndex_tf1 = cpu_nParamPerEvent_tf1[Offset + 1]; + const unsigned int numParams_tf1 = cpu_nParamPerEvent_tf1[Offset]; // Compute total weight for the current event #ifdef MULTITHREAD diff --git a/splines/gpuSplineUtils.cu b/splines/gpuSplineUtils.cu index c1a84f979..65254ce31 100644 --- a/splines/gpuSplineUtils.cu +++ b/splines/gpuSplineUtils.cu @@ -104,7 +104,6 @@ SMonolithGPU::~SMonolithGPU(){ // ******************************************* // Initialiser when using the x array and combined y,b,c,d array __host__ void SMonolithGPU::InitGPU_SplineMonolith( -// ******************************************* #ifndef Weight_On_SplineBySpline_Basis float **cpu_total_weights, int n_events, @@ -113,7 +112,7 @@ __host__ void SMonolithGPU::InitGPU_SplineMonolith( unsigned int n_splines, unsigned int n_tf1, int Eve_size) { - +// ******************************************* // Allocate chunks of memory to GPU cudaMalloc((void **) &gpu_paramNo_arr, n_splines*sizeof(short int)); CudaCheckError(); @@ -197,7 +196,6 @@ __host__ void SMonolithGPU::InitGPU_Vals(float **vals) { // ****************************************************** // Copy to GPU for x array and separate ybcd array __host__ void SMonolithGPU::CopyToGPU_SplineMonolith( -// ****************************************************** SplineMonoStruct* cpu_spline_handler, // TFI related now @@ -214,6 +212,7 @@ __host__ void SMonolithGPU::CopyToGPU_SplineMonolith( short int spline_size, unsigned int total_nknots, unsigned int n_tf1) { +// ****************************************************** if (n_params != _N_SPLINES_) { printf("Number of splines not equal to %i, GPU code for event-by-event splines will fail\n", _N_SPLINES_); printf("n_params = %i\n", n_params); @@ -351,7 +350,6 @@ __global__ void EvalOnGPU_Splines( float* __restrict__ gpu_weights, const cudaTextureObject_t __restrict__ text_coeff_x) { //********************************************************* - // points per spline is the offset to skip in the index to move between splines const unsigned int splineNum = (blockIdx.x * blockDim.x + threadIdx.x); @@ -375,9 +373,9 @@ __global__ void EvalOnGPU_Splines( // We've read the segment straight from CPU and is saved in segment_gpu // polynomial parameters from the monolithic splineMonolith const float fY = gpu_coeff_many[CurrentKnotPos]; - const float fB = gpu_coeff_many[CurrentKnotPos+1]; - const float fC = gpu_coeff_many[CurrentKnotPos+2]; - const float fD = gpu_coeff_many[CurrentKnotPos+3]; + const float fB = gpu_coeff_many[CurrentKnotPos + 1]; + const float fC = gpu_coeff_many[CurrentKnotPos + 2]; + const float fD = gpu_coeff_many[CurrentKnotPos + 3]; // The is the variation itself (needed to evaluate variation - stored spline point = dx) const float dx = val_gpu[Param] - tex1Dfetch(text_coeff_x, segment_X); @@ -399,7 +397,6 @@ __global__ void EvalOnGPU_TF1( const short int* __restrict__ gpu_paramNo_arr_tf1, float* __restrict__ gpu_weights_tf1) { //********************************************************* - // points per spline is the offset to skip in the index to move between splines const unsigned int tf1Num = (blockIdx.x * blockDim.x + threadIdx.x); @@ -408,18 +405,14 @@ __global__ void EvalOnGPU_TF1( const float x = val_gpu[gpu_paramNo_arr_tf1[tf1Num]]; // Read the coefficients - const float a = gpu_coeffs_tf1[tf1Num*_nTF1Coeff_]; - const float b = gpu_coeffs_tf1[tf1Num*_nTF1Coeff_+1]; + const unsigned int TF1_Index = tf1Num * _nTF1Coeff_; + const float a = gpu_coeffs_tf1[TF1_Index]; + const float b = gpu_coeffs_tf1[TF1_Index+1]; gpu_weights_tf1[tf1Num] = fmaf(a, x, b); // gpu_weights_tf1[tf1Num] = a*x + b; - // //gpu_weights_tf1[tf1Num] = 1 + a*x + b*x*x + c*x*x*x + d*x*x*x*x + e*x*x*x*x*x; - - //#ifdef DEBUG - //printf("TF1 = %i/%i, weight = %f \n", tf1Num, d_n_TF1, gpu_weights_tf1[tf1Num]); - //#endif } } @@ -443,24 +436,14 @@ __global__ void EvalOnGPU_TotWeight( { shared_total_weights[threadIdx.x] = 1.f; - const unsigned int EventOffset = 2*EventNum; + const unsigned int EventOffset = 2 * EventNum; - for (unsigned int id = 0; id < tex1Dfetch(text_nParamPerEvent, EventOffset); ++id) - { + for (unsigned int id = 0; id < tex1Dfetch(text_nParamPerEvent, EventOffset); ++id) { shared_total_weights[threadIdx.x] *= gpu_weights[tex1Dfetch(text_nParamPerEvent, EventOffset+1) + id]; - //#ifdef DEBUG - //printf("Event = %i, Spline_Num = %i, gpu_weights = %f \n", - // EventNum, tex1Dfetch(text_nParamPerEvent, 2*EventNum+1) + id, gpu_weights[tex1Dfetch(text_nParamPerEvent, 2*EventNum+1) + id]); - //#endif } - for (unsigned int id = 0; id < tex1Dfetch(text_nParamPerEvent_TF1, EventOffset); ++id) - { + for (unsigned int id = 0; id < tex1Dfetch(text_nParamPerEvent_TF1, EventOffset); ++id) { shared_total_weights[threadIdx.x] *= gpu_weights_tf1[tex1Dfetch(text_nParamPerEvent_TF1, EventOffset+1) + id]; - //#ifdef DEBUG - //printf("Event = %i, Spline_Num = %i, gpu_weights_tf1 = %f \n", - // EventNum, tex1Dfetch(text_nParamPerEvent_TF1, 2*EventNum+1) + id, gpu_weights_tf1[tex1Dfetch(text_nParamPerEvent_TF1, 2*EventNum+1) + id]); - //#endif } gpu_total_weights[EventNum] = shared_total_weights[threadIdx.x]; }