From 9c24fe74eb2349e2f0871b8ce429ef1d6adcc15d Mon Sep 17 00:00:00 2001
From: Kamil Skwarczynski <skwarczynskikamil@gmail.com>
Date: Sat, 8 Feb 2025 11:20:44 +0000
Subject: [PATCH] Minor speedup due to reusing calcuaiton isntead of repeating
 them

---
 covariance/covarianceBase.cpp |  4 ++--
 splines/SplineMonolith.cpp    | 30 +++++++++++++--------------
 splines/gpuSplineUtils.cu     | 39 ++++++++++-------------------------
 3 files changed, 28 insertions(+), 45 deletions(-)
diff --git a/covariance/covarianceBase.cpp b/covariance/covarianceBase.cpp
index 922c20175..ccac2b5eb 100644
--- a/covariance/covarianceBase.cpp
+++ b/covariance/covarianceBase.cpp
@@ -667,8 +667,8 @@ double covarianceBase::CalcLikelihood() const _noexcept_ {
     for (int j = 0; j <= i; ++j) {
       if (!_fFlatPrior[i] && !_fFlatPrior[j]) {
         //KS: Since matrix is symmetric we can calculate non diagonal elements only once and multiply by 2, can bring up to factor speed decrease.
-        short int scale = (i != j) ? 2 : 1;
-        logL += scale * 0.5*(_fPropVal[i] - _fPreFitValue[i])*(_fPropVal[j] - _fPreFitValue[j])*InvertCovMatrix[i][j];
+        double scale = (i != j) ? 1. : 0.5;
+        logL += scale * (_fPropVal[i] - _fPreFitValue[i])*(_fPropVal[j] - _fPreFitValue[j])*InvertCovMatrix[i][j];
       }
     }
   }
diff --git a/splines/SplineMonolith.cpp b/splines/SplineMonolith.cpp
index 4e4a0074f..e41e813ee 100644
--- a/splines/SplineMonolith.cpp
+++ b/splines/SplineMonolith.cpp
@@ -525,7 +525,7 @@ void SMonolith::LoadSplineFile(std::string FileName) {
       FileName.insert(0, std::string(std::getenv("MACH3"))+"/");
    }
 
-  TFile *SplineFile = new TFile(FileName.c_str(), "OPEN");
+  auto SplineFile = std::make_unique<TFile>(FileName.c_str(), "OPEN");
   TTree *Settings = SplineFile->Get<TTree>("Settings");
   TTree *Monolith = SplineFile->Get<TTree>("Monolith");
   TTree *Monolith_TF1 = SplineFile->Get<TTree>("Monolith_TF1");
@@ -655,7 +655,6 @@ void SMonolith::LoadSplineFile(std::string FileName) {
   }
 
   SplineFile->Close();
-  delete SplineFile;
 
   // Print some info; could probably make this to a separate function
   PrintInitialsiation();
@@ -672,7 +671,7 @@ void SMonolith::PrepareSplineFile() {
       FileName.insert(0, std::string(std::getenv("MACH3"))+"/");
    }
 
-  TFile *SplineFile = new TFile(FileName.c_str(), "recreate");
+  auto SplineFile = std::make_unique<TFile>(FileName.c_str(), "recreate");
   TTree *Settings = new TTree("Settings", "Settings");
   TTree *Monolith = new TTree("Monolith", "Monolith");
   TTree *Monolith_TF1 = new TTree("Monolith_TF1", "Monolith_TF1");
@@ -791,7 +790,6 @@ void SMonolith::PrepareSplineFile() {
   delete EventInfo;
   delete FastSplineInfoTree;
   SplineFile->Close();
-  delete SplineFile;
 }
 
 // *****************************************
@@ -929,7 +927,7 @@ void SMonolith::FindSplineSegment() {
   for (M3::int_t i = 0; i < nParams; ++i)
   {
     const M3::int_t nPoints = SplineInfoArray[i].nPts;
-    const M3::float_t* xArray = SplineInfoArray[i].xPts;
+    const M3::float_t* _restrict_ xArray = SplineInfoArray[i].xPts;
 
     // Get the variation for this reconfigure for the ith parameter
     const float xvar = float(*SplineInfoArray[i].splineParsPointer);
@@ -1029,9 +1027,9 @@ void SMonolith::CalcSplineWeights() {
       // We've read the segment straight from CPU and is saved in segment_gpu
       // polynomial parameters from the monolithic splineMonolith
       const float fY = cpu_spline_handler->coeff_many[CurrentKnotPos];
-      const float fB = cpu_spline_handler->coeff_many[CurrentKnotPos+1];
-      const float fC = cpu_spline_handler->coeff_many[CurrentKnotPos+2];
-      const float fD = cpu_spline_handler->coeff_many[CurrentKnotPos+3];
+      const float fB = cpu_spline_handler->coeff_many[CurrentKnotPos + 1];
+      const float fC = cpu_spline_handler->coeff_many[CurrentKnotPos + 2];
+      const float fD = cpu_spline_handler->coeff_many[CurrentKnotPos + 3];
       // The is the variation itself (needed to evaluate variation - stored spline point = dx)
       const float dx = ParamValues[Param] - cpu_spline_handler->coeff_x[segment_X];
 
@@ -1050,12 +1048,12 @@ void SMonolith::CalcSplineWeights() {
       const float x = ParamValues[cpu_paramNo_TF1_arr[tf1Num]];
 
       // Read the coefficients
-      const float a = cpu_coeff_TF1_many[tf1Num*_nTF1Coeff_];
-      const float b = cpu_coeff_TF1_many[tf1Num*_nTF1Coeff_+1];
+      const unsigned int TF1_Index = tf1Num * _nTF1Coeff_;
+      const float a = cpu_coeff_TF1_many[TF1_Index];
+      const float b = cpu_coeff_TF1_many[TF1_Index + 1];
 
       cpu_weights_tf1_var[tf1Num] = fmaf(a, x, b);
       // cpu_weights_tf1_var[tf1Num] = a*x + b;
-
       //cpu_weights_tf1_var[splineNum] = 1 + a*x + b*x*x + c*x*x*x + d*x*x*x*x + e*x*x*x*x*x;
     }
   #ifdef MULTITHREAD
@@ -1076,9 +1074,11 @@ void SMonolith::ModifyWeights(){
   {
     float totalWeight = 1.0f; // Initialize total weight for each event
 
+    const unsigned int Offset = 2 * EventNum;
+
     // Extract the parameters for the current event
-    const unsigned int startIndex = cpu_nParamPerEvent[2 * EventNum + 1];
-    const unsigned int numParams = cpu_nParamPerEvent[2 * EventNum];
+    const unsigned int startIndex = cpu_nParamPerEvent[Offset + 1];
+    const unsigned int numParams = cpu_nParamPerEvent[Offset];
 
     // Compute total weight for the current event
     #ifdef MULTITHREAD
@@ -1089,8 +1089,8 @@ void SMonolith::ModifyWeights(){
     }
     //Now TF1
     // Extract the parameters for the current event
-    const unsigned int startIndex_tf1 = cpu_nParamPerEvent_tf1[2 * EventNum + 1];
-    const unsigned int numParams_tf1 = cpu_nParamPerEvent_tf1[2 * EventNum];
+    const unsigned int startIndex_tf1 = cpu_nParamPerEvent_tf1[Offset + 1];
+    const unsigned int numParams_tf1 = cpu_nParamPerEvent_tf1[Offset];
 
     // Compute total weight for the current event
     #ifdef MULTITHREAD
diff --git a/splines/gpuSplineUtils.cu b/splines/gpuSplineUtils.cu
index c1a84f979..65254ce31 100644
--- a/splines/gpuSplineUtils.cu
+++ b/splines/gpuSplineUtils.cu
@@ -104,7 +104,6 @@ SMonolithGPU::~SMonolithGPU(){
 // *******************************************
 // Initialiser when using the x array and combined y,b,c,d array
 __host__ void SMonolithGPU::InitGPU_SplineMonolith(
-// *******************************************
                  #ifndef Weight_On_SplineBySpline_Basis
                           float **cpu_total_weights, 
                           int n_events,                              
@@ -113,7 +112,7 @@ __host__ void SMonolithGPU::InitGPU_SplineMonolith(
                           unsigned int n_splines,
                           unsigned int n_tf1,
                           int Eve_size) {
-
+// *******************************************
   // Allocate chunks of memory to GPU
   cudaMalloc((void **) &gpu_paramNo_arr, n_splines*sizeof(short int));
   CudaCheckError();
@@ -197,7 +196,6 @@ __host__ void SMonolithGPU::InitGPU_Vals(float **vals) {
 // ******************************************************
 // Copy to GPU for x array and separate ybcd array
 __host__ void SMonolithGPU::CopyToGPU_SplineMonolith(
-// ******************************************************
                             SplineMonoStruct* cpu_spline_handler,
 
                             // TFI related now
@@ -214,6 +212,7 @@ __host__ void SMonolithGPU::CopyToGPU_SplineMonolith(
                             short int spline_size,
                             unsigned int total_nknots,
                             unsigned int n_tf1) {
+// ******************************************************
   if (n_params != _N_SPLINES_) {
     printf("Number of splines not equal to %i, GPU code for event-by-event splines will fail\n", _N_SPLINES_);
     printf("n_params = %i\n", n_params);
@@ -351,7 +350,6 @@ __global__ void EvalOnGPU_Splines(
   float* __restrict__ gpu_weights,
   const cudaTextureObject_t __restrict__ text_coeff_x) {
 //*********************************************************
-
   // points per spline is the offset to skip in the index to move between splines
   const unsigned int splineNum = (blockIdx.x * blockDim.x + threadIdx.x);
 
@@ -375,9 +373,9 @@ __global__ void EvalOnGPU_Splines(
     // We've read the segment straight from CPU and is saved in segment_gpu
     // polynomial parameters from the monolithic splineMonolith
     const float fY = gpu_coeff_many[CurrentKnotPos];
-    const float fB = gpu_coeff_many[CurrentKnotPos+1];
-    const float fC = gpu_coeff_many[CurrentKnotPos+2];
-    const float fD = gpu_coeff_many[CurrentKnotPos+3];
+    const float fB = gpu_coeff_many[CurrentKnotPos + 1];
+    const float fC = gpu_coeff_many[CurrentKnotPos + 2];
+    const float fD = gpu_coeff_many[CurrentKnotPos + 3];
     // The is the variation itself (needed to evaluate variation - stored spline point = dx)
     const float dx = val_gpu[Param] - tex1Dfetch<float>(text_coeff_x, segment_X);
 
@@ -399,7 +397,6 @@ __global__ void EvalOnGPU_TF1(
     const short int* __restrict__ gpu_paramNo_arr_tf1,
     float* __restrict__ gpu_weights_tf1) {
 //*********************************************************
-
   // points per spline is the offset to skip in the index to move between splines
   const unsigned int tf1Num = (blockIdx.x * blockDim.x + threadIdx.x);
 
@@ -408,18 +405,14 @@ __global__ void EvalOnGPU_TF1(
     const float x = val_gpu[gpu_paramNo_arr_tf1[tf1Num]];
 
     // Read the coefficients
-    const float a = gpu_coeffs_tf1[tf1Num*_nTF1Coeff_];
-    const float b = gpu_coeffs_tf1[tf1Num*_nTF1Coeff_+1];
+    const unsigned int TF1_Index = tf1Num * _nTF1Coeff_;
+    const float a = gpu_coeffs_tf1[TF1_Index];
+    const float b = gpu_coeffs_tf1[TF1_Index+1];
 
     gpu_weights_tf1[tf1Num] = fmaf(a, x, b);
 
     // gpu_weights_tf1[tf1Num] = a*x + b;
-    //
     //gpu_weights_tf1[tf1Num] = 1 + a*x + b*x*x + c*x*x*x + d*x*x*x*x + e*x*x*x*x*x;
-
-    //#ifdef DEBUG
-    //printf("TF1 = %i/%i, weight = %f \n", tf1Num, d_n_TF1, gpu_weights_tf1[tf1Num]);
-    //#endif
   }
 }
 
@@ -443,24 +436,14 @@ __global__ void EvalOnGPU_TotWeight(
   {
     shared_total_weights[threadIdx.x] = 1.f;
 
-    const unsigned int EventOffset = 2*EventNum;
+    const unsigned int EventOffset = 2 * EventNum;
 
-    for (unsigned int id = 0; id < tex1Dfetch<unsigned int>(text_nParamPerEvent, EventOffset); ++id)
-    {
+    for (unsigned int id = 0; id < tex1Dfetch<unsigned int>(text_nParamPerEvent, EventOffset); ++id) {
       shared_total_weights[threadIdx.x] *= gpu_weights[tex1Dfetch<unsigned int>(text_nParamPerEvent, EventOffset+1) + id];
-      //#ifdef DEBUG
-      //printf("Event = %i, Spline_Num = %i, gpu_weights = %f \n",
-      //        EventNum, tex1Dfetch<unsigned int>(text_nParamPerEvent, 2*EventNum+1) + id, gpu_weights[tex1Dfetch<unsigned int>(text_nParamPerEvent, 2*EventNum+1) + id]);
-      //#endif
     }
 
-    for (unsigned int id = 0; id < tex1Dfetch<unsigned int>(text_nParamPerEvent_TF1, EventOffset); ++id)
-    {
+    for (unsigned int id = 0; id < tex1Dfetch<unsigned int>(text_nParamPerEvent_TF1, EventOffset); ++id) {
       shared_total_weights[threadIdx.x] *= gpu_weights_tf1[tex1Dfetch<unsigned int>(text_nParamPerEvent_TF1, EventOffset+1) + id];
-      //#ifdef DEBUG
-      //printf("Event = %i, Spline_Num = %i, gpu_weights_tf1 = %f \n",
-      //       EventNum, tex1Dfetch<unsigned int>(text_nParamPerEvent_TF1, 2*EventNum+1) + id, gpu_weights_tf1[tex1Dfetch<unsigned int>(text_nParamPerEvent_TF1, 2*EventNum+1) + id]);
-      //#endif
     }
     gpu_total_weights[EventNum] = shared_total_weights[threadIdx.x];
   }