From bfdb426157c4e55623616fd81b894cc28524ef38 Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Sun, 3 Mar 2024 18:52:45 +0100 Subject: [PATCH] [susy2] ** COMPLETE SUSY2 ** go back to itscrd90 test logs git checkout f4d951c7ddfc635707c14e0fe5a0628fd4aec0ac tput/logs_* tmad/logs_* --- .../log_eemumu_mad_d_inl0_hrd0.txt | 308 +++++++++++------ .../log_eemumu_mad_f_inl0_hrd0.txt | 310 +++++++++++------ .../log_eemumu_mad_m_inl0_hrd0.txt | 306 +++++++++++------ .../log_ggtt_mad_d_inl0_hrd0.txt | 308 +++++++++++------ .../log_ggtt_mad_f_inl0_hrd0.txt | 306 +++++++++++------ .../log_ggtt_mad_m_inl0_hrd0.txt | 304 +++++++++++------ .../log_ggttg_mad_d_inl0_hrd0.txt | 306 +++++++++++------ .../log_ggttg_mad_f_inl0_hrd0.txt | 312 +++++++++++------ .../log_ggttg_mad_m_inl0_hrd0.txt | 308 +++++++++++------ .../log_ggttgg_mad_d_inl0_hrd0.txt | 306 +++++++++++------ .../log_ggttgg_mad_f_inl0_hrd0.txt | 308 +++++++++++------ .../log_ggttgg_mad_m_inl0_hrd0.txt | 310 +++++++++++------ .../log_ggttggg_mad_d_inl0_hrd0.txt | 312 +++++++++++------ .../log_ggttggg_mad_f_inl0_hrd0.txt | 316 ++++++++++++------ .../log_ggttggg_mad_m_inl0_hrd0.txt | 310 +++++++++++------ .../log_gqttq_mad_d_inl0_hrd0.txt | 310 +++++++++++------ .../log_gqttq_mad_f_inl0_hrd0.txt | 312 +++++++++++------ .../log_gqttq_mad_m_inl0_hrd0.txt | 310 +++++++++++------ .../log_eemumu_mad_d_inl0_hrd0.txt | 215 ++++++------ .../log_eemumu_mad_d_inl0_hrd0_bridge.txt | 223 ++++++------ .../log_eemumu_mad_d_inl0_hrd0_common.txt | 195 ++++++----- .../log_eemumu_mad_d_inl0_hrd0_curhst.txt | 190 +++++++---- .../log_eemumu_mad_d_inl0_hrd0_rmbhst.txt | 217 ++++++------ .../log_eemumu_mad_d_inl0_hrd1.txt | 215 ++++++------ .../log_eemumu_mad_d_inl1_hrd0.txt | 215 ++++++------ .../log_eemumu_mad_d_inl1_hrd1.txt | 215 ++++++------ .../log_eemumu_mad_f_inl0_hrd0.txt | 215 ++++++------ .../log_eemumu_mad_f_inl0_hrd0_bridge.txt | 223 ++++++------ .../log_eemumu_mad_f_inl0_hrd0_common.txt | 195 ++++++----- .../log_eemumu_mad_f_inl0_hrd0_curhst.txt | 190 +++++++---- .../log_eemumu_mad_f_inl0_hrd0_rmbhst.txt | 217 ++++++------ .../log_eemumu_mad_f_inl0_hrd1.txt | 215 ++++++------ .../log_eemumu_mad_f_inl1_hrd0.txt | 215 ++++++------ .../log_eemumu_mad_f_inl1_hrd1.txt | 215 ++++++------ .../log_eemumu_mad_m_inl0_hrd0.txt | 215 ++++++------ .../log_eemumu_mad_m_inl0_hrd1.txt | 215 ++++++------ .../log_ggtt_mad_d_inl0_hrd0.txt | 217 ++++++------ .../log_ggtt_mad_d_inl0_hrd0_bridge.txt | 225 +++++++------ .../log_ggtt_mad_d_inl0_hrd0_common.txt | 197 ++++++----- .../log_ggtt_mad_d_inl0_hrd0_curhst.txt | 192 +++++++---- .../log_ggtt_mad_d_inl0_hrd0_rmbhst.txt | 219 ++++++------ .../log_ggtt_mad_d_inl0_hrd1.txt | 217 ++++++------ .../log_ggtt_mad_d_inl1_hrd0.txt | 215 ++++++------ .../log_ggtt_mad_d_inl1_hrd1.txt | 215 ++++++------ .../log_ggtt_mad_f_inl0_hrd0.txt | 217 ++++++------ .../log_ggtt_mad_f_inl0_hrd0_bridge.txt | 225 +++++++------ .../log_ggtt_mad_f_inl0_hrd0_common.txt | 197 ++++++----- .../log_ggtt_mad_f_inl0_hrd0_curhst.txt | 192 +++++++---- .../log_ggtt_mad_f_inl0_hrd0_rmbhst.txt | 219 ++++++------ .../log_ggtt_mad_f_inl0_hrd1.txt | 217 ++++++------ .../log_ggtt_mad_f_inl1_hrd0.txt | 215 ++++++------ .../log_ggtt_mad_f_inl1_hrd1.txt | 215 ++++++------ .../log_ggtt_mad_m_inl0_hrd0.txt | 217 ++++++------ .../log_ggtt_mad_m_inl0_hrd1.txt | 217 ++++++------ .../log_ggttg_mad_d_inl0_hrd0.txt | 231 +++++++------ .../log_ggttg_mad_d_inl0_hrd0_bridge.txt | 243 ++++++++------ .../log_ggttg_mad_d_inl0_hrd1.txt | 231 +++++++------ .../log_ggttg_mad_f_inl0_hrd0.txt | 231 +++++++------ .../log_ggttg_mad_f_inl0_hrd0_bridge.txt | 243 ++++++++------ .../log_ggttg_mad_f_inl0_hrd1.txt | 231 +++++++------ .../log_ggttg_mad_m_inl0_hrd0.txt | 233 +++++++------ .../log_ggttg_mad_m_inl0_hrd1.txt | 233 +++++++------ .../log_ggttgg_mad_d_inl0_hrd0.txt | 231 +++++++------ .../log_ggttgg_mad_d_inl0_hrd0_bridge.txt | 243 ++++++++------ .../log_ggttgg_mad_d_inl0_hrd0_common.txt | 211 +++++++----- .../log_ggttgg_mad_d_inl0_hrd0_curhst.txt | 206 ++++++++---- .../log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt | 234 +++++++------ .../log_ggttgg_mad_d_inl0_hrd1.txt | 231 +++++++------ .../log_ggttgg_mad_d_inl1_hrd0.txt | 231 +++++++------ .../log_ggttgg_mad_d_inl1_hrd1.txt | 231 +++++++------ .../log_ggttgg_mad_f_inl0_hrd0.txt | 231 +++++++------ .../log_ggttgg_mad_f_inl0_hrd0_bridge.txt | 243 ++++++++------ .../log_ggttgg_mad_f_inl0_hrd0_common.txt | 211 +++++++----- .../log_ggttgg_mad_f_inl0_hrd0_curhst.txt | 206 ++++++++---- .../log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt | 234 +++++++------ .../log_ggttgg_mad_f_inl0_hrd1.txt | 237 +++++++------ .../log_ggttgg_mad_f_inl1_hrd0.txt | 235 +++++++------ .../log_ggttgg_mad_f_inl1_hrd1.txt | 235 +++++++------ .../log_ggttgg_mad_m_inl0_hrd0.txt | 231 +++++++------ .../log_ggttgg_mad_m_inl0_hrd1.txt | 231 +++++++------ .../log_ggttggg_mad_d_inl0_hrd0.txt | 231 +++++++------ .../log_ggttggg_mad_d_inl0_hrd0_bridge.txt | 243 ++++++++------ .../log_ggttggg_mad_d_inl0_hrd1.txt | 231 +++++++------ .../log_ggttggg_mad_f_inl0_hrd0.txt | 235 +++++++------ .../log_ggttggg_mad_f_inl0_hrd0_bridge.txt | 247 ++++++++------ .../log_ggttggg_mad_f_inl0_hrd1.txt | 235 +++++++------ .../log_ggttggg_mad_m_inl0_hrd0.txt | 231 +++++++------ .../log_ggttggg_mad_m_inl0_hrd1.txt | 231 +++++++------ .../log_gqttq_mad_d_inl0_hrd0.txt | 231 +++++++------ .../log_gqttq_mad_d_inl0_hrd0_bridge.txt | 243 ++++++++------ .../log_gqttq_mad_d_inl0_hrd1.txt | 231 +++++++------ .../log_gqttq_mad_f_inl0_hrd0.txt | 231 +++++++------ .../log_gqttq_mad_f_inl0_hrd0_bridge.txt | 243 ++++++++------ .../log_gqttq_mad_f_inl0_hrd1.txt | 231 +++++++------ .../log_gqttq_mad_m_inl0_hrd0.txt | 231 +++++++------ .../log_gqttq_mad_m_inl0_hrd1.txt | 231 +++++++------ 96 files changed, 13818 insertions(+), 9116 deletions(-) diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index a540c52e3f..fb2022a061 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none - +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-03-01_19:37:08 +DATE: 2024-03-01_03:35:28 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.4698s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4632s - [COUNTERS] Fortran MEs ( 1 ) : 0.0066s for 8192 events => throughput is 1.24E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.6832s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6748s + [COUNTERS] Fortran MEs ( 1 ) : 0.0084s for 8192 events => throughput is 9.71E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1232s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1166s - [COUNTERS] Fortran MEs ( 1 ) : 0.0066s for 8192 events => throughput is 1.23E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1761s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1679s + [COUNTERS] Fortran MEs ( 1 ) : 0.0082s for 8192 events => throughput is 9.99E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2640s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1959s - [COUNTERS] Fortran MEs ( 1 ) : 0.0681s for 90112 events => throughput is 1.32E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3673s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2800s + [COUNTERS] Fortran MEs ( 1 ) : 0.0874s for 90112 events => throughput is 1.03E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661545E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1273s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1218s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0056s for 8192 events => throughput is 1.47E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1811s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1745s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0066s for 8192 events => throughput is 1.24E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2606s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2024s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0582s for 90112 events => throughput is 1.55E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3805s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3053s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0752s for 90112 events => throughput is 1.20E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.577544e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.174335e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.582008e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.235605e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1234s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1201s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0033s for 8192 events => throughput is 2.48E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1874s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1831s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0043s for 8192 events => throughput is 1.89E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2364s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2023s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0341s for 90112 events => throughput is 2.64E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3298s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2854s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0444s for 90112 events => throughput is 2.03E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.707525e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.003456e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.816473e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.071261e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1205s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1181s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0024s for 8192 events => throughput is 3.35E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1738s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1708s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.75E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2272s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2006s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0266s for 90112 events => throughput is 3.39E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3165s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2833s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0332s for 90112 events => throughput is 2.71E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000739E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.530014e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.590204e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.685057e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.724231e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1203s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1180s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.50E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1747s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1718s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0029s for 8192 events => throughput is 2.78E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2270s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2014s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0256s for 90112 events => throughput is 3.52E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3170s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2851s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0318s for 90112 events => throughput is 2.83E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000739E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.623987e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.651963e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.843267e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.888816e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1200s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1182s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0019s for 8192 events => throughput is 4.42E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1756s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1721s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.33E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000739E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2195s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2001s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0193s for 90112 events => throughput is 4.66E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3283s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2888s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0395s for 90112 events => throughput is 2.28E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000739E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.833138e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.333417e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.160270e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.247580e+06 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1591 events (found 1595 events) + [COUNTERS] PROGRAM TOTAL : 0.5894s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5889s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.58E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715404661532E-002) differ by less than 3E-14 (0.0) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09152 [9.1515602020000753E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1782 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 0.7068s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7019s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 90112 events => throughput is 1.84E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602020000753E-002) differ by less than 3E-14 (1.1102230246251565e-16) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.143768e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.922192e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.720542e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.434610e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.732238e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.027929e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.748145e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.129848e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index 01d3bcd1ad..130936da07 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 - - make USEBUILDDIR=1 AVX=512y + make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-03-01_19:37:12 +DATE: 2024-03-01_03:35:44 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.4777s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4710s - [COUNTERS] Fortran MEs ( 1 ) : 0.0066s for 8192 events => throughput is 1.23E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7004s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6920s + [COUNTERS] Fortran MEs ( 1 ) : 0.0084s for 8192 events => throughput is 9.73E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1239s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1173s - [COUNTERS] Fortran MEs ( 1 ) : 0.0066s for 8192 events => throughput is 1.24E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1752s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1674s + [COUNTERS] Fortran MEs ( 1 ) : 0.0079s for 8192 events => throughput is 1.04E+06 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2678s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2000s - [COUNTERS] Fortran MEs ( 1 ) : 0.0678s for 90112 events => throughput is 1.33E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3760s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2888s + [COUNTERS] Fortran MEs ( 1 ) : 0.0872s for 90112 events => throughput is 1.03E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382703205998396E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1242s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1193s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0049s for 8192 events => throughput is 1.67E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1795s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1733s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0062s for 8192 events => throughput is 1.32E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515590123565249E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2527s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2015s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0512s for 90112 events => throughput is 1.76E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3578s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2889s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0690s for 90112 events => throughput is 1.31E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515590123565249E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.824486e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.296058e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.843269e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.289423e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382700723828302E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1192s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1173s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0019s for 8192 events => throughput is 4.20E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1759s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1734s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0025s for 8192 events => throughput is 3.33E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515587612890761E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2180s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1970s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0210s for 90112 events => throughput is 4.30E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3141s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2860s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0281s for 90112 events => throughput is 3.21E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587612890761E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.470047e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.247103e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.654667e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.346461e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382700679354239E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1210s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1192s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.53E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1759s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1735s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.49E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515587619408464E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2216s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2017s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0198s for 90112 events => throughput is 4.55E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3181s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2923s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0258s for 90112 events => throughput is 3.50E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587619408464E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.847882e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.473027e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.058914e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.779574e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382700679354239E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1207s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1189s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0018s for 8192 events => throughput is 4.54E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1764s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1741s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.61E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515587619408464E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2203s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2008s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0195s for 90112 events => throughput is 4.62E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3134s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2887s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0247s for 90112 events => throughput is 3.65E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515587619408464E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.901436e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.393313e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.119507e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.850238e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382704335459282E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1190s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1178s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0012s for 8192 events => throughput is 6.69E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1750s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1726s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0024s for 8192 events => throughput is 3.46E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515591296252558E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2147s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2012s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0134s for 90112 events => throughput is 6.70E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3156s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2901s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0255s for 90112 events => throughput is 3.53E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515591296252558E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.099144e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.340689e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 7.624026e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.795181e+06 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09338 [9.3382706077425631E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1591 events (found 1595 events) + [COUNTERS] PROGRAM TOTAL : 0.5865s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5861s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.74E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382706077425631E-002) differ by less than 4E-4 (9.988182347875352e-08) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09152 [9.1515592892887687E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1782 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 0.7069s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7024s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0045s for 90112 events => throughput is 1.98E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515592892887687E-002) differ by less than 4E-4 (9.973286385633884e-08) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.528794e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.178202e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.848804e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.051133e+09 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.014035e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.222690e+09 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.412951e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.409232e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index aa33fb4f59..da7367ae5e 100644 --- a/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y + make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' OMP_NUM_THREADS= -DATE: 2024-03-01_19:37:16 +DATE: 2024-03-01_03:36:01 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 3798 events (found 8192 events) - [COUNTERS] PROGRAM TOTAL : 0.4707s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4640s - [COUNTERS] Fortran MEs ( 1 ) : 0.0066s for 8192 events => throughput is 1.23E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.7189s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7106s + [COUNTERS] Fortran MEs ( 1 ) : 0.0083s for 8192 events => throughput is 9.88E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/avalassi/output_eemumu_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x1_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715404661532E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1250s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1183s - [COUNTERS] Fortran MEs ( 1 ) : 0.0066s for 8192 events => throughput is 1.24E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1780s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1693s + [COUNTERS] Fortran MEs ( 1 ) : 0.0086s for 8192 events => throughput is 9.48E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/avalassi/output_eemumu_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_eemumu_x10_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602020000766E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2620s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1946s - [COUNTERS] Fortran MEs ( 1 ) : 0.0674s for 90112 events => throughput is 1.34E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3702s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2823s + [COUNTERS] Fortran MEs ( 1 ) : 0.0879s for 90112 events => throughput is 1.03E+06 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715420701395E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1278s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1222s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0056s for 8192 events => throughput is 1.45E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1808s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1742s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0066s for 8192 events => throughput is 1.24E+06 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602033080859E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2613s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2018s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0594s for 90112 events => throughput is 1.52E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3592s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2860s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0732s for 90112 events => throughput is 1.23E+06 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602033080859E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.544290e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.182030e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.541211e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.222787e+06 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715420701354E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1220s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1189s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.65E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1746s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1707s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0040s for 8192 events => throughput is 2.07E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602033080859E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2330s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1999s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0331s for 90112 events => throughput is 2.72E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3256s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2834s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0422s for 90112 events => throughput is 2.14E+06 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602033080859E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.764036e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.086150e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.888476e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.131619e+06 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1210s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1185s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0024s for 8192 events => throughput is 3.38E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1726s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1696s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0030s for 8192 events => throughput is 2.76E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2266s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2002s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0264s for 90112 events => throughput is 3.42E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3229s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2892s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0338s for 90112 events => throughput is 2.67E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602022697845E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.562659e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.541763e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.728436e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.454900e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1208s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1185s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0023s for 8192 events => throughput is 3.57E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1830s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1799s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0031s for 8192 events => throughput is 2.61E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2250s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1999s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0251s for 90112 events => throughput is 3.59E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3167s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2848s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0319s for 90112 events => throughput is 2.83E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602022697845E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.748896e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.677035e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.946216e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.872617e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09338 [9.3382715383664494E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1591 events (found 1595 events) - [COUNTERS] PROGRAM TOTAL : 0.1190s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1172s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0017s for 8192 events => throughput is 4.73E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.1756s + [COUNTERS] Fortran Overhead ( 0 ) : 0.1722s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0035s for 8192 events => throughput is 2.36E+06 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 4/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_eemumu_x [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.09152 [9.1515602022697845E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1782 events (found 1787 events) - [COUNTERS] PROGRAM TOTAL : 0.2180s - [COUNTERS] Fortran Overhead ( 0 ) : 0.1992s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0188s for 90112 events => throughput is 4.80E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3270s + [COUNTERS] Fortran Overhead ( 0 ) : 0.2890s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0380s for 90112 events => throughput is 2.37E+06 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602022697845E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.001566e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.248118e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.332952e+06 ) sec^-1 +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.400436e+06 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x1_cudacpp > /tmp/avalassi/output_eemumu_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09338 [9.3382715392009194E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1591 events (found 1595 events) + [COUNTERS] PROGRAM TOTAL : 0.5902s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5897s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.66E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.3382715404661532E-002) and cpp (9.3382715392009194E-002) differ by less than 2E-4 (1.3548906441229747e-10) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_eemumu_x10_cudacpp > /tmp/avalassi/output_eemumu_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 4/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.09152 [9.1515602021089631E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1782 events (found 1787 events) + [COUNTERS] PROGRAM TOTAL : 0.7064s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7016s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0048s for 90112 events => throughput is 1.87E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (9.1515602020000766E-002) and cpp (9.1515602021089631E-002) differ by less than 2E-4 (1.1898038110302878e-11) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.153365e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.922960e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.732117e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.451486e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.736678e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.069247e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.733211e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.156375e+08 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index f736b14f89..657075d34f 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 - make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y + make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-03-01_19:37:20 +DATE: 2024-03-01_03:36:18 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 0.5241s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4919s - [COUNTERS] Fortran MEs ( 1 ) : 0.0322s for 8192 events => throughput is 2.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.8052s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7640s + [COUNTERS] Fortran MEs ( 1 ) : 0.0412s for 8192 events => throughput is 1.99E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2693s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2371s - [COUNTERS] Fortran MEs ( 1 ) : 0.0322s for 8192 events => throughput is 2.54E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3849s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3442s + [COUNTERS] Fortran MEs ( 1 ) : 0.0408s for 8192 events => throughput is 2.01E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.2093s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8563s - [COUNTERS] Fortran MEs ( 1 ) : 0.3530s for 90112 events => throughput is 2.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6297s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1833s + [COUNTERS] Fortran MEs ( 1 ) : 0.4464s for 90112 events => throughput is 2.02E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756647] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2948s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2654s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0294s for 8192 events => throughput is 2.78E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4282s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3910s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0373s for 8192 events => throughput is 2.20E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.2065s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8827s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3238s for 90112 events => throughput is 2.78E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6901s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2834s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4066s for 90112 events => throughput is 2.22E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ b OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.789923e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.207121e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.858469e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.224007e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2706s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2539s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0167s for 8192 events => throughput is 4.90E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3927s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3711s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0217s for 8192 events => throughput is 3.78E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989106] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.0538s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8701s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1836s for 90112 events => throughput is 4.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4997s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2628s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2369s for 90112 events => throughput is 3.80E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989106) differ b OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.827269e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.699229e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.939228e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.772412e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2562s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2462s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0101s for 8192 events => throughput is 8.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3811s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3679s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0132s for 8192 events => throughput is 6.20E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 0.9712s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8607s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1105s for 90112 events => throughput is 8.16E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4003s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2548s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1455s for 90112 events => throughput is 6.19E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ b OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.441330e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.020313e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.515876e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.141769e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2546s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2450s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0096s for 8192 events => throughput is 8.54E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3737s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3616s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0121s for 8192 events => throughput is 6.75E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 0.9629s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8596s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1033s for 90112 events => throughput is 8.72E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3804s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2520s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1283s for 90112 events => throughput is 7.02E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ b OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.937684e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.898875e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.116082e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.924828e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756626] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2551s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2456s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0095s for 8192 events => throughput is 8.66E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3919s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3726s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0193s for 8192 events => throughput is 4.24E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 0.9646s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8611s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1035s for 90112 events => throughput is 8.71E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5267s + [COUNTERS] Fortran Overhead ( 0 ) : 1.3034s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2233s for 90112 events => throughput is 4.04E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ b OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.551823e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.791161e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.755678e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.782832e+05 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.7828s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7823s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.45E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.094184803756640) and cpp (47.094184803756640) differ by less than 3E-14 (0.0) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.11 [47.105695279989121] fbridge_mode=1 + [UNWEIGHT] Wrote 1744 events (found 1749 events) + [COUNTERS] PROGRAM TOTAL : 1.6782s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6718s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.42E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.105695279989114) and cpp (47.105695279989121) differ by less than 3E-14 (2.220446049250313e-16) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.045663e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.714246e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.010596e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.071675e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.000853e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.152555e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.001515e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.100234e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index 235aa30713..eb011c6697 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,4 +1,4 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none @@ -6,37 +6,37 @@ make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 - make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y + make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-03-01_19:37:30 +DATE: 2024-03-01_03:36:45 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 0.5257s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4936s - [COUNTERS] Fortran MEs ( 1 ) : 0.0322s for 8192 events => throughput is 2.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7779s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7373s + [COUNTERS] Fortran MEs ( 1 ) : 0.0406s for 8192 events => throughput is 2.02E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2688s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2366s - [COUNTERS] Fortran MEs ( 1 ) : 0.0322s for 8192 events => throughput is 2.54E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3888s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3479s + [COUNTERS] Fortran MEs ( 1 ) : 0.0409s for 8192 events => throughput is 2.00E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.2051s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8514s - [COUNTERS] Fortran MEs ( 1 ) : 0.3537s for 90112 events => throughput is 2.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6449s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1962s + [COUNTERS] Fortran MEs ( 1 ) : 0.4487s for 90112 events => throughput is 2.01E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094179780921394] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2903s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2635s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0268s for 8192 events => throughput is 3.06E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4205s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3860s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0345s for 8192 events => throughput is 2.37E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105688579298537] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.1743s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8796s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2947s for 90112 events => throughput is 3.06E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6592s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2787s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3805s for 90112 events => throughput is 2.37E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105688579298537) differ b OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.132672e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.351307e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.165533e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.338637e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094175850060040] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2584s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2465s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0119s for 8192 events => throughput is 6.88E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3859s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3711s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0148s for 8192 events => throughput is 5.54E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105684763984058] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 0.9906s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8610s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1296s for 90112 events => throughput is 6.95E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4203s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2581s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1622s for 90112 events => throughput is 5.56E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105684763984058) differ b OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.797834e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.210465e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.867367e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.317035e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094173652938650] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2496s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2435s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0061s for 8192 events => throughput is 1.34E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3679s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3602s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0078s for 8192 events => throughput is 1.05E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105684048677361] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 0.9271s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8599s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0672s for 90112 events => throughput is 1.34E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.3368s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2515s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0853s for 90112 events => throughput is 1.06E+06 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105684048677361) differ b OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.382080e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.038889e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.412593e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.040818e+06 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094173652938650] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2471s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2414s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0057s for 8192 events => throughput is 1.43E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3685s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3613s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0073s for 8192 events => throughput is 1.13E+06 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105684048677361] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 0.9222s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8592s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0630s for 90112 events => throughput is 1.43E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.3377s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2584s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0793s for 90112 events => throughput is 1.14E+06 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105684048677361) differ b OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.495407e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.104729e+06 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.513487e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.124265e+06 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094178213275804] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2466s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2415s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0052s for 8192 events => throughput is 1.58E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3726s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3621s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0105s for 8192 events => throughput is 7.77E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105688407939567] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 0.9147s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8576s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0571s for 90112 events => throughput is 1.58E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.3760s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2629s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1130s for 90112 events => throughput is 7.97E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105688407939567) differ b OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.606554e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.591310e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.630447e+06 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.407728e+05 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.09 [47.094184344050284] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.7795s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7789s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.48E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.094184803756640) and cpp (47.094184344050284) differ by less than 4E-4 (9.761425112664313e-09) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.11 [47.105694586476879] fbridge_mode=1 + [UNWEIGHT] Wrote 1744 events (found 1749 events) + [COUNTERS] PROGRAM TOTAL : 1.7005s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6948s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0057s for 90112 events => throughput is 1.57E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.105695279989114) and cpp (47.105694586476879) differ by less than 4E-4 (1.4722471020078842e-08) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.201563e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.986974e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.810580e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.774762e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.802177e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.847890e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.368745e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.422351e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 8034ca9ebb..bef66309f6 100644 --- a/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,4 +1,4 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none @@ -6,37 +6,37 @@ make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' OMP_NUM_THREADS= -DATE: 2024-03-01_19:37:40 +DATE: 2024-03-01_03:37:12 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 2601 events (found 5405 events) - [COUNTERS] PROGRAM TOTAL : 0.5244s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4923s - [COUNTERS] Fortran MEs ( 1 ) : 0.0322s for 8192 events => throughput is 2.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.7917s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7505s + [COUNTERS] Fortran MEs ( 1 ) : 0.0413s for 8192 events => throughput is 1.99E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/avalassi/output_ggtt_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x1_fortran > /tmp/aval [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094184803756640] fbridge_mode=0 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2689s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2362s - [COUNTERS] Fortran MEs ( 1 ) : 0.0326s for 8192 events => throughput is 2.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3956s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3546s + [COUNTERS] Fortran MEs ( 1 ) : 0.0410s for 8192 events => throughput is 2.00E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/avalassi/output_ggtt_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggtt_x10_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105695279989114] fbridge_mode=0 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.2077s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8533s - [COUNTERS] Fortran MEs ( 1 ) : 0.3545s for 90112 events => throughput is 2.54E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6496s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1993s + [COUNTERS] Fortran MEs ( 1 ) : 0.4503s for 90112 events => throughput is 2.00E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094186141863901] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2956s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2655s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0300s for 8192 events => throughput is 2.73E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4256s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3880s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0377s for 8192 events => throughput is 2.18E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696630006634] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.2105s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8806s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3299s for 90112 events => throughput is 2.73E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6990s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2867s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4123s for 90112 events => throughput is 2.19E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105696630006634) differ b OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.728057e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.185122e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.785272e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.177902e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094186141863901] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2690s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2525s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0165s for 8192 events => throughput is 4.98E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3989s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3779s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0210s for 8192 events => throughput is 3.91E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696630006626] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 1.0454s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8647s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1806s for 90112 events => throughput is 4.99E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.5038s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2721s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2317s for 90112 events => throughput is 3.89E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105696630006626) differ b OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.062202e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.744718e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.973810e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.796645e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2557s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2457s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0100s for 8192 events => throughput is 8.15E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3789s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3661s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0128s for 8192 events => throughput is 6.40E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 0.9697s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8600s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1097s for 90112 events => throughput is 8.21E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4021s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2588s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1433s for 90112 events => throughput is 6.29E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105696663215774) differ b OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.485845e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.012402e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.626997e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.056070e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2541s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2449s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0093s for 8192 events => throughput is 8.84E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3770s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3652s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0118s for 8192 events => throughput is 6.94E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 0.9619s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8601s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1018s for 90112 events => throughput is 8.85E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.3844s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2567s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1278s for 90112 events => throughput is 7.05E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105696663215774) differ b OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.108224e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.957699e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.251964e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.976096e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x1_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.09 [47.094186169585456] fbridge_mode=1 [UNWEIGHT] Wrote 1603 events (found 1608 events) - [COUNTERS] PROGRAM TOTAL : 0.2544s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2450s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 8192 events => throughput is 8.73E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3910s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3720s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0190s for 8192 events => throughput is 4.32E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/16 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggtt_x10 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 47.11 [47.105696663215774] fbridge_mode=1 [UNWEIGHT] Wrote 1744 events (found 1749 events) - [COUNTERS] PROGRAM TOTAL : 0.9643s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8623s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1020s for 90112 events => throughput is 8.83E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.4767s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2708s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2059s for 90112 events => throughput is 4.38E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (47.105695279989114) and cpp (47.105696663215774) differ b OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.699461e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.223304e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.028075e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.269412e+05 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x1_cudacpp > /tmp/avalassi/output_ggtt_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.09 [47.094184798437830] fbridge_mode=1 + [UNWEIGHT] Wrote 1603 events (found 1608 events) + [COUNTERS] PROGRAM TOTAL : 0.7843s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7837s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0006s for 8192 events => throughput is 1.37E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.094184803756640) and cpp (47.094184798437830) differ by less than 2E-4 (1.1293987967064822e-10) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggtt_x10_cudacpp > /tmp/avalassi/output_ggtt_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/16 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 47.11 [47.105695279068492] fbridge_mode=1 + [UNWEIGHT] Wrote 1744 events (found 1749 events) + [COUNTERS] PROGRAM TOTAL : 1.6837s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6773s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0064s for 90112 events => throughput is 1.40E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (47.105695279989114) and cpp (47.105695279068492) differ by less than 2E-4 (1.954369999168648e-11) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.090244e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.672934e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.997070e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.055834e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.991192e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.134835e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.012024e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.999333e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index 36f4d3601a..cd3823dd45 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none - +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 + make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-03-01_19:37:50 +DATE: 2024-03-01_03:37:39 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.4918s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2352s - [COUNTERS] Fortran MEs ( 1 ) : 0.2567s for 8192 events => throughput is 3.19E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6990s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3830s + [COUNTERS] Fortran MEs ( 1 ) : 0.3160s for 8192 events => throughput is 2.59E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4671s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2109s - [COUNTERS] Fortran MEs ( 1 ) : 0.2562s for 8192 events => throughput is 3.20E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6286s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3119s + [COUNTERS] Fortran MEs ( 1 ) : 0.3167s for 8192 events => throughput is 2.59E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 3.8468s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0281s - [COUNTERS] Fortran MEs ( 1 ) : 2.8187s for 90112 events => throughput is 3.20E+04 events/s + [COUNTERS] PROGRAM TOTAL : 4.9846s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4835s + [COUNTERS] Fortran MEs ( 1 ) : 3.5010s for 90112 events => throughput is 2.57E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.7285s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4659s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2625s for 8192 events => throughput is 3.12E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.0129s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6673s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3457s for 8192 events => throughput is 2.37E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717666E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 4.1673s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2870s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.8803s for 90112 events => throughput is 3.13E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3959s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8159s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.5801s for 90112 events => throughput is 2.52E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717666E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.206647e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.608629e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.221936e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.585600e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607748863] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4770s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3418s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1352s for 8192 events => throughput is 6.06E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6456s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4771s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1686s for 8192 events => throughput is 4.86E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717666E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.6522s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1622s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.4900s for 90112 events => throughput is 6.05E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.4996s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6435s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.8561s for 90112 events => throughput is 4.85E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717666E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.254884e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.966202e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.291268e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.937901e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.3368s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2730s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0638s for 8192 events => throughput is 1.28E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4803s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3957s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0846s for 8192 events => throughput is 9.68E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.7929s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0905s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7023s for 90112 events => throughput is 1.28E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.5052s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5758s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9294s for 90112 events => throughput is 9.70E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.314146e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.913999e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.312940e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.849874e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.3290s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2694s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0596s for 8192 events => throughput is 1.37E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4790s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3971s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0818s for 8192 events => throughput is 1.00E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.7376s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0829s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6547s for 90112 events => throughput is 1.38E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3870s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5613s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8257s for 90112 events => throughput is 1.09E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.414630e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.099230e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.418410e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.125635e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749110] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.2981s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2548s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0433s for 8192 events => throughput is 1.89E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5283s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4199s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1084s for 8192 events => throughput is 7.55E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.5391s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0679s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4712s for 90112 events => throughput is 1.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.7535s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5980s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1555s for 90112 events => throughput is 7.80E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717722E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.907849e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.774058e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.952413e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.841638e+04 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=1 + [UNWEIGHT] Wrote 386 events (found 1179 events) + [COUNTERS] PROGRAM TOTAL : 0.7462s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7408s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.51E+06 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748607749111) differ by less than 3E-14 (0.0) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07924 [7.9238481932717736E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 1.9272s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9044s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.95E+06 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481932717736E-002) differ by less than 3E-14 (2.220446049250313e-16) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.632538e+06 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.097542e+06 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.673182e+06 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.241730e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.666883e+06 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.250394e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.680746e+06 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.758368e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index 24c504a5ca..b22193f403 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none + +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 - - make USEBUILDDIR=1 AVX=512y + make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-03-01_19:38:12 +DATE: 2024-03-01_03:38:22 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.4906s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2344s - [COUNTERS] Fortran MEs ( 1 ) : 0.2562s for 8192 events => throughput is 3.20E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6628s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3454s + [COUNTERS] Fortran MEs ( 1 ) : 0.3174s for 8192 events => throughput is 2.58E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4674s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2112s - [COUNTERS] Fortran MEs ( 1 ) : 0.2562s for 8192 events => throughput is 3.20E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6295s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3130s + [COUNTERS] Fortran MEs ( 1 ) : 0.3165s for 8192 events => throughput is 2.59E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 3.8381s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0198s - [COUNTERS] Fortran MEs ( 1 ) : 2.8183s for 90112 events => throughput is 3.20E+04 events/s + [COUNTERS] PROGRAM TOTAL : 4.9722s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4831s + [COUNTERS] Fortran MEs ( 1 ) : 3.4891s for 90112 events => throughput is 2.58E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112722621426752] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.7110s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4576s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2534s for 8192 events => throughput is 3.23E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9336s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6187s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3149s for 8192 events => throughput is 2.60E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238468310179624E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 4.0696s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2755s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.7941s for 90112 events => throughput is 3.23E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.3385s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8107s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.5278s for 90112 events => throughput is 2.55E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238468310179624E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.313377e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.649087e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.325155e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.678753e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112720710186394] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.3591s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2834s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0757s for 8192 events => throughput is 1.08E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4987s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4044s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0944s for 8192 events => throughput is 8.68E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238454786658835E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.9379s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1108s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8271s for 90112 events => throughput is 1.09E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.5977s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5622s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0354s for 90112 events => throughput is 8.70E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238454786658835E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.114703e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.791493e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.120515e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.818254e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112721766950902] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.2750s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2419s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0331s for 8192 events => throughput is 2.47E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4003s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3576s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0427s for 8192 events => throughput is 1.92E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238453735016964E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.4282s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0644s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3638s for 90112 events => throughput is 2.48E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9911s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5156s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4755s for 90112 events => throughput is 1.90E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238453735016964E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.549677e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.915431e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.559529e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.928091e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112721766950902] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.2701s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2394s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0307s for 8192 events => throughput is 2.67E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3910s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3509s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0400s for 8192 events => throughput is 2.05E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238453735016964E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.3923s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0546s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3377s for 90112 events => throughput is 2.67E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9540s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5218s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4322s for 90112 events => throughput is 2.09E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238453735016964E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.759664e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.113903e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.772533e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.128293e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112723387847480] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.2548s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2344s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0203s for 8192 events => throughput is 4.03E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4175s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3658s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0516s for 8192 events => throughput is 1.59E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238464410949921E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.2671s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0435s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2236s for 90112 events => throughput is 4.03E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.0938s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5331s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5607s for 90112 events => throughput is 1.61E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238464410949921E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.036698e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.580486e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.079449e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.544942e+05 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.1011 [0.10112726034625694] fbridge_mode=1 + [UNWEIGHT] Wrote 386 events (found 1179 events) + [COUNTERS] PROGRAM TOTAL : 0.7476s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7467s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0008s for 8192 events => throughput is 9.72E+06 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112726034625694) differ by less than 4E-4 (2.2321452152196386e-06) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07924 [7.9238473828077680E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 1.9141s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9047s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0094s for 90112 events => throughput is 9.57E+06 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238473828077680E-002) differ by less than 4E-4 (1.0228161673175862e-07) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.317603e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.855249e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.653705e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.471958e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.666794e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.507869e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.515295e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.625829e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index 3b804b916f..994bc4f8f2 100644 --- a/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 -make USEBUILDDIR=1 AVX=avx2 - +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y + make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' OMP_NUM_THREADS= -DATE: 2024-03-01_19:38:31 +DATE: 2024-03-01_03:39:00 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 365 events (found 1496 events) - [COUNTERS] PROGRAM TOTAL : 0.4914s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2347s - [COUNTERS] Fortran MEs ( 1 ) : 0.2568s for 8192 events => throughput is 3.19E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6750s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3573s + [COUNTERS] Fortran MEs ( 1 ) : 0.3178s for 8192 events => throughput is 2.58E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/avalassi/output_ggttg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748607749111] fbridge_mode=0 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4663s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2100s - [COUNTERS] Fortran MEs ( 1 ) : 0.2563s for 8192 events => throughput is 3.20E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6388s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3205s + [COUNTERS] Fortran MEs ( 1 ) : 0.3183s for 8192 events => throughput is 2.57E+04 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/avalassi/output_ggttg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttg_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238481932717722E-002] fbridge_mode=0 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 3.8461s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0203s - [COUNTERS] Fortran MEs ( 1 ) : 2.8258s for 90112 events => throughput is 3.19E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.0099s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5073s + [COUNTERS] Fortran MEs ( 1 ) : 3.5026s for 90112 events => throughput is 2.57E+04 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748700702684] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.7487s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4785s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2702s for 8192 events => throughput is 3.03E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.9635s + [COUNTERS] Fortran Overhead ( 0 ) : 0.6336s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3299s for 8192 events => throughput is 2.48E+04 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238482679400354E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 4.2292s - [COUNTERS] Fortran Overhead ( 0 ) : 1.2873s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.9419s for 90112 events => throughput is 3.06E+04 events/s + [COUNTERS] PROGRAM TOTAL : 5.4154s + [COUNTERS] Fortran Overhead ( 0 ) : 1.7958s + [COUNTERS] CudaCpp MEs ( 2 ) : 3.6196s for 90112 events => throughput is 2.49E+04 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482679400354E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.165447e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.562106e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 3.152487e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.547562e+04 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748702805033] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.4799s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3471s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1328s for 8192 events => throughput is 6.17E+04 events/s + [COUNTERS] PROGRAM TOTAL : 0.6394s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4739s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1654s for 8192 events => throughput is 4.95E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238482683055667E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 2.6010s - [COUNTERS] Fortran Overhead ( 0 ) : 1.1530s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.4480s for 90112 events => throughput is 6.22E+04 events/s + [COUNTERS] PROGRAM TOTAL : 3.4743s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6457s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.8285s for 90112 events => throughput is 4.93E+04 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482683055667E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.328779e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.063467e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 6.375772e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 5.051938e+04 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748681415580] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.3422s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2786s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0635s for 8192 events => throughput is 1.29E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4771s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3935s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0836s for 8192 events => throughput is 9.80E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238482534347232E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.7929s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0938s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6991s for 90112 events => throughput is 1.29E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4752s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5541s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9211s for 90112 events => throughput is 9.78E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482534347232E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.324352e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.001861e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.329871e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.840887e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748681415580] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.3263s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2680s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0583s for 8192 events => throughput is 1.41E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4581s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3846s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0735s for 8192 events => throughput is 1.11E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238482534347232E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.7235s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0825s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6410s for 90112 events => throughput is 1.41E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3626s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5499s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8127s for 90112 events => throughput is 1.11E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482534347232E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.436389e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.148207e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.440980e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.152825e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.1011 [0.10112748700265108] fbridge_mode=1 [UNWEIGHT] Wrote 386 events (found 1179 events) - [COUNTERS] PROGRAM TOTAL : 0.3043s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2596s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0447s for 8192 events => throughput is 1.83E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.5403s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4268s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1136s for 8192 events => throughput is 7.21E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 32/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttg_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.07924 [7.9238482666076374E-002] fbridge_mode=1 [UNWEIGHT] Wrote 1898 events (found 1903 events) - [COUNTERS] PROGRAM TOTAL : 1.5642s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0705s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4936s for 90112 events => throughput is 1.83E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.7973s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6097s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1876s for 90112 events => throughput is 7.59E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238482666076374E-002 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.857254e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.609614e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.864849e+05 ) sec^-1 +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.592843e+04 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x1_cudacpp > /tmp/avalassi/output_ggttg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.1011 [0.10112748601943165] fbridge_mode=1 + [UNWEIGHT] Wrote 386 events (found 1179 events) + [COUNTERS] PROGRAM TOTAL : 0.7459s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7405s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0054s for 8192 events => throughput is 1.51E+06 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.10112748607749111) and cpp (0.10112748601943165) differ by less than 2E-4 (5.74121417074025e-10) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttg_x10_cudacpp > /tmp/avalassi/output_ggttg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 32/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.07924 [7.9238481937154381E-002] fbridge_mode=1 + [UNWEIGHT] Wrote 1898 events (found 1903 events) + [COUNTERS] PROGRAM TOTAL : 1.9191s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8964s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0228s for 90112 events => throughput is 3.96E+06 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (7.9238481932717722E-002) and cpp (7.9238481937154381E-002) differ by less than 2E-4 (5.5991211667105745e-11) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.624489e+06 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.862423e+06 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.598562e+06 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.230160e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.604858e+06 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.241022e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.618302e+06 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.712384e+06 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index 10cb20bbb1..455a867420 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 - make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y + make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-03-01_19:38:52 +DATE: 2024-03-01_03:39:43 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] PROGRAM TOTAL : 3.6842s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2355s - [COUNTERS] Fortran MEs ( 1 ) : 3.4488s for 8192 events => throughput is 2.38E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.5262s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3968s + [COUNTERS] Fortran MEs ( 1 ) : 4.1295s for 8192 events => throughput is 1.98E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 3.6715s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2301s - [COUNTERS] Fortran MEs ( 1 ) : 3.4414s for 8192 events => throughput is 2.38E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4601s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3420s + [COUNTERS] Fortran MEs ( 1 ) : 4.1180s for 8192 events => throughput is 1.99E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=0 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 39.6093s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4241s - [COUNTERS] Fortran MEs ( 1 ) : 38.1852s for 90112 events => throughput is 2.36E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.7126s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0517s + [COUNTERS] Fortran MEs ( 1 ) : 45.6608s for 90112 events => throughput is 1.97E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102372E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 6.9402s - [COUNTERS] Fortran Overhead ( 0 ) : 3.5336s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.4065s for 8192 events => throughput is 2.40E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.7056s + [COUNTERS] Fortran Overhead ( 0 ) : 4.4601s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.2455s for 8192 events => throughput is 1.93E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438642451704E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 42.2863s - [COUNTERS] Fortran Overhead ( 0 ) : 4.7174s - [COUNTERS] CudaCpp MEs ( 2 ) : 37.5689s for 90112 events => throughput is 2.40E+03 events/s + [COUNTERS] PROGRAM TOTAL : 53.1561s + [COUNTERS] Fortran Overhead ( 0 ) : 6.1171s + [COUNTERS] CudaCpp MEs ( 2 ) : 47.0390s for 90112 events => throughput is 1.92E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451704E-004 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.479889e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.989312e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.479923e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.975004e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 3.7209s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9393s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.7816s for 8192 events => throughput is 4.60E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7773s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5170s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.2604s for 8192 events => throughput is 3.62E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438642451701E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 22.6693s - [COUNTERS] Fortran Overhead ( 0 ) : 3.0997s - [COUNTERS] CudaCpp MEs ( 2 ) : 19.5696s for 90112 events => throughput is 4.60E+03 events/s + [COUNTERS] PROGRAM TOTAL : 29.0103s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1559s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.8544s for 90112 events => throughput is 3.63E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451701E-004 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.828014e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.801009e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.821992e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.781734e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.7069s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9577s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7492s for 8192 events => throughput is 1.09E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.2569s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2848s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9721s for 8192 events => throughput is 8.43E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 10.3842s - [COUNTERS] Fortran Overhead ( 0 ) : 2.1372s - [COUNTERS] CudaCpp MEs ( 2 ) : 8.2470s for 90112 events => throughput is 1.09E+04 events/s + [COUNTERS] PROGRAM TOTAL : 13.7501s + [COUNTERS] Fortran Overhead ( 0 ) : 2.9426s + [COUNTERS] CudaCpp MEs ( 2 ) : 10.8075s for 90112 events => throughput is 8.34E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.131656e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.607758e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.130829e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.615061e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.5295s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8703s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6591s for 8192 events => throughput is 1.24E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.0290s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1707s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8583s for 8192 events => throughput is 9.54E+03 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 9.2923s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0408s - [COUNTERS] CudaCpp MEs ( 2 ) : 7.2515s for 90112 events => throughput is 1.24E+04 events/s + [COUNTERS] PROGRAM TOTAL : 12.2922s + [COUNTERS] Fortran Overhead ( 0 ) : 2.8302s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4620s for 90112 events => throughput is 9.52E+03 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.282050e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.701965e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.280247e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.814187e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.0471s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6354s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4117s for 8192 events => throughput is 1.99E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.5040s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4024s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1017s for 8192 events => throughput is 7.44E+03 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 6.3290s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8022s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.5269s for 90112 events => throughput is 1.99E+04 events/s + [COUNTERS] PROGRAM TOTAL : 14.7910s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0412s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.7498s for 90112 events => throughput is 7.67E+03 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.029838e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.831586e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.031463e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.821061e+03 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000387 [3.8703612510102367E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 121 events (found 923 events) + [COUNTERS] PROGRAM TOTAL : 0.8686s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8368s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0319s for 8192 events => throughput is 2.57E+05 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612510102367E-004) differ by less than 3E-14 (2.220446049250313e-16) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 2.8233s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4732s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3501s for 90112 events => throughput is 2.57E+05 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642451712E-004) differ by less than 3E-14 (0.0) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.280922e+05 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.518844e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.106750e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.162850e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.106625e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.168282e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.107369e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.430988e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index cf2dcec7e0..5e945a4db8 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none + make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 - - make USEBUILDDIR=1 AVX=512y + make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-03-01_19:41:40 +DATE: 2024-03-01_03:43:57 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] PROGRAM TOTAL : 3.6839s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2360s - [COUNTERS] Fortran MEs ( 1 ) : 3.4480s for 8192 events => throughput is 2.38E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4989s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3403s + [COUNTERS] Fortran MEs ( 1 ) : 4.1586s for 8192 events => throughput is 1.97E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 3.6716s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2300s - [COUNTERS] Fortran MEs ( 1 ) : 3.4416s for 8192 events => throughput is 2.38E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4650s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3366s + [COUNTERS] Fortran MEs ( 1 ) : 4.1284s for 8192 events => throughput is 1.98E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=0 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 39.2771s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4084s - [COUNTERS] Fortran MEs ( 1 ) : 37.8687s for 90112 events => throughput is 2.38E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.5707s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0343s + [COUNTERS] Fortran MEs ( 1 ) : 45.5364s for 90112 events => throughput is 1.98E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703729438336302E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 6.7808s - [COUNTERS] Fortran Overhead ( 0 ) : 3.4578s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.3230s for 8192 events => throughput is 2.47E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.4568s + [COUNTERS] Fortran Overhead ( 0 ) : 4.3045s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.1523s for 8192 events => throughput is 1.97E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793486626492658E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 41.2049s - [COUNTERS] Fortran Overhead ( 0 ) : 4.6268s - [COUNTERS] CudaCpp MEs ( 2 ) : 36.5781s for 90112 events => throughput is 2.46E+03 events/s + [COUNTERS] PROGRAM TOTAL : 51.1261s + [COUNTERS] Fortran Overhead ( 0 ) : 5.9844s + [COUNTERS] CudaCpp MEs ( 2 ) : 45.1417s for 90112 events => throughput is 2.00E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793486626492658E-004 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.540625e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.070377e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.542609e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.032691e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703722581317850E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.8927s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0485s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.8442s for 8192 events => throughput is 9.70E+03 events/s + [COUNTERS] PROGRAM TOTAL : 2.5531s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4379s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.1153s for 8192 events => throughput is 7.35E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793483759856148E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 11.5324s - [COUNTERS] Fortran Overhead ( 0 ) : 2.2190s - [COUNTERS] CudaCpp MEs ( 2 ) : 9.3134s for 90112 events => throughput is 9.68E+03 events/s + [COUNTERS] PROGRAM TOTAL : 15.4011s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1124s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.2887s for 90112 events => throughput is 7.33E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793483759856148E-004 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.994672e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.468143e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.997533e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.493623e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703722425602170E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.9673s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5920s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3753s for 8192 events => throughput is 2.18E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.3122s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8184s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4938s for 8192 events => throughput is 1.66E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793483698376133E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 5.9096s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7724s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.1372s for 90112 events => throughput is 2.18E+04 events/s + [COUNTERS] PROGRAM TOTAL : 7.8863s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4589s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.4274s for 90112 events => throughput is 1.66E+04 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793483698376133E-004 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.262354e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.689224e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.259720e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.712522e+04 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703722425602170E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.8878s - [COUNTERS] Fortran Overhead ( 0 ) : 0.5548s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3330s for 8192 events => throughput is 2.46E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.1887s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7547s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4340s for 8192 events => throughput is 1.89E+04 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793483698376133E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 5.3953s - [COUNTERS] Fortran Overhead ( 0 ) : 1.7248s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.6705s for 90112 events => throughput is 2.46E+04 events/s + [COUNTERS] PROGRAM TOTAL : 7.2113s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4166s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.7946s for 90112 events => throughput is 1.88E+04 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793483698376133E-004 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.532533e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.812765e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.533837e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.800388e+04 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703728658657426E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 0.6309s - [COUNTERS] Fortran Overhead ( 0 ) : 0.4278s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2032s for 8192 events => throughput is 4.03E+04 events/s + [COUNTERS] PROGRAM TOTAL : 1.4119s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8827s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.5292s for 8192 events => throughput is 1.55E+04 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793486977281547E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 3.8256s - [COUNTERS] Fortran Overhead ( 0 ) : 1.5943s - [COUNTERS] CudaCpp MEs ( 2 ) : 2.2313s for 90112 events => throughput is 4.04E+04 events/s + [COUNTERS] PROGRAM TOTAL : 8.3753s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5229s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.8525s for 90112 events => throughput is 1.54E+04 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793486977281547E-004 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.132811e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.556546e+04 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.135906e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.565832e+04 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000387 [3.8703736267486325E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 121 events (found 923 events) + [COUNTERS] PROGRAM TOTAL : 0.8334s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8120s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0214s for 8192 events => throughput is 3.82E+05 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703736267486325E-004) differ by less than 4E-4 (3.1975667371675343e-06) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0001579 [1.5793489323670813E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 2.7017s + [COUNTERS] Fortran Overhead ( 0 ) : 2.4654s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2363s for 90112 events => throughput is 3.81E+05 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793489323670813E-004) differ by less than 4E-4 (3.20900471706409e-06) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.592263e+05 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.940482e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.499807e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.638317e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.497540e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.635301e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.483569e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.518477e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index d3fac22074..4a1ef98d00 100644 --- a/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none - -make USEBUILDDIR=1 AVX=512y +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=512y + make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' OMP_NUM_THREADS= -DATE: 2024-03-01_19:43:57 +DATE: 2024-03-01_03:47:17 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 62 events (found 950 events) - [COUNTERS] PROGRAM TOTAL : 3.6772s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2339s - [COUNTERS] Fortran MEs ( 1 ) : 3.4433s for 8192 events => throughput is 2.38E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4720s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3418s + [COUNTERS] Fortran MEs ( 1 ) : 4.1302s for 8192 events => throughput is 1.98E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/avalassi/output_ggttgg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x1_fortran > /tmp/av [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612510102356E-004] fbridge_mode=0 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 3.6711s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2305s - [COUNTERS] Fortran MEs ( 1 ) : 3.4406s for 8192 events => throughput is 2.38E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.4586s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3357s + [COUNTERS] Fortran MEs ( 1 ) : 4.1229s for 8192 events => throughput is 1.99E+03 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/avalassi/output_ggttgg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttgg_x10_fortran > /tmp/a [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438642451712E-004] fbridge_mode=0 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 39.2987s - [COUNTERS] Fortran Overhead ( 0 ) : 1.4107s - [COUNTERS] Fortran MEs ( 1 ) : 37.8881s for 90112 events => throughput is 2.38E+03 events/s + [COUNTERS] PROGRAM TOTAL : 47.6222s + [COUNTERS] Fortran Overhead ( 0 ) : 2.0494s + [COUNTERS] Fortran MEs ( 1 ) : 45.5728s for 90112 events => throughput is 1.98E+03 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612659176674E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 7.0168s - [COUNTERS] Fortran Overhead ( 0 ) : 3.5696s - [COUNTERS] CudaCpp MEs ( 2 ) : 3.4472s for 8192 events => throughput is 2.38E+03 events/s + [COUNTERS] PROGRAM TOTAL : 8.7912s + [COUNTERS] Fortran Overhead ( 0 ) : 4.5114s + [COUNTERS] CudaCpp MEs ( 2 ) : 4.2799s for 8192 events => throughput is 1.91E+03 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438704534934E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 42.7588s - [COUNTERS] Fortran Overhead ( 0 ) : 4.7366s - [COUNTERS] CudaCpp MEs ( 2 ) : 38.0222s for 90112 events => throughput is 2.37E+03 events/s + [COUNTERS] PROGRAM TOTAL : 53.4090s + [COUNTERS] Fortran Overhead ( 0 ) : 6.1734s + [COUNTERS] CudaCpp MEs ( 2 ) : 47.2356s for 90112 events => throughput is 1.91E+03 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438704534934E-004 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.463868e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.968066e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.464521e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.968245e+03 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612692816703E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 3.7820s - [COUNTERS] Fortran Overhead ( 0 ) : 1.9818s - [COUNTERS] CudaCpp MEs ( 2 ) : 1.8002s for 8192 events => throughput is 4.55E+03 events/s + [COUNTERS] PROGRAM TOTAL : 4.7232s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5040s + [COUNTERS] CudaCpp MEs ( 2 ) : 2.2192s for 8192 events => throughput is 3.69E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438707226035E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 22.9813s - [COUNTERS] Fortran Overhead ( 0 ) : 3.1541s - [COUNTERS] CudaCpp MEs ( 2 ) : 19.8273s for 90112 events => throughput is 4.54E+03 events/s + [COUNTERS] PROGRAM TOTAL : 28.6711s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1739s + [COUNTERS] CudaCpp MEs ( 2 ) : 24.4972s for 90112 events => throughput is 3.68E+03 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438707226035E-004 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.685449e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.727620e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.685208e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.685802e+03 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612675240517E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.6443s - [COUNTERS] Fortran Overhead ( 0 ) : 0.9281s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.7163s for 8192 events => throughput is 1.14E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.2625s + [COUNTERS] Fortran Overhead ( 0 ) : 1.2738s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.9887s for 8192 events => throughput is 8.29E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438703631775E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 9.9868s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0991s - [COUNTERS] CudaCpp MEs ( 2 ) : 7.8878s for 90112 events => throughput is 1.14E+04 events/s + [COUNTERS] PROGRAM TOTAL : 13.6031s + [COUNTERS] Fortran Overhead ( 0 ) : 2.9396s + [COUNTERS] CudaCpp MEs ( 2 ) : 10.6635s for 90112 events => throughput is 8.45E+03 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438703631775E-004 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.175286e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.715236e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.174859e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.685374e+03 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612675240517E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.5228s - [COUNTERS] Fortran Overhead ( 0 ) : 0.8674s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6553s for 8192 events => throughput is 1.25E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.0253s + [COUNTERS] Fortran Overhead ( 0 ) : 1.1676s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8577s for 8192 events => throughput is 9.55E+03 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438703631775E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 9.2502s - [COUNTERS] Fortran Overhead ( 0 ) : 2.0335s - [COUNTERS] CudaCpp MEs ( 2 ) : 7.2167s for 90112 events => throughput is 1.25E+04 events/s + [COUNTERS] PROGRAM TOTAL : 12.2295s + [COUNTERS] Fortran Overhead ( 0 ) : 2.8222s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.4074s for 90112 events => throughput is 9.58E+03 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438703631775E-004 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.286275e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.886999e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.287072e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.910216e+03 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.000387 [3.8703612675240517E-004] fbridge_mode=1 [UNWEIGHT] Wrote 121 events (found 923 events) - [COUNTERS] PROGRAM TOTAL : 1.0514s - [COUNTERS] Fortran Overhead ( 0 ) : 0.6358s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.4156s for 8192 events => throughput is 1.97E+04 events/s + [COUNTERS] PROGRAM TOTAL : 2.4883s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4086s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0796s for 8192 events => throughput is 7.59E+03 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 64/64 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttgg_x [XSECTION] ChannelId = 2 [XSECTION] Cross section = 0.0001579 [1.5793438703631775E-004] fbridge_mode=1 [UNWEIGHT] Wrote 1361 events (found 1881 events) - [COUNTERS] PROGRAM TOTAL : 6.4259s - [COUNTERS] Fortran Overhead ( 0 ) : 1.8132s - [COUNTERS] CudaCpp MEs ( 2 ) : 4.6127s for 90112 events => throughput is 1.95E+04 events/s + [COUNTERS] PROGRAM TOTAL : 15.1764s + [COUNTERS] Fortran Overhead ( 0 ) : 3.0860s + [COUNTERS] CudaCpp MEs ( 2 ) : 12.0904s for 90112 events => throughput is 7.45E+03 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438703631775E-004 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.016833e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.643781e+03 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.019820e+04 ) sec^-1 +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.679757e+03 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x1_cudacpp > /tmp/avalassi/output_ggttgg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.000387 [3.8703612512203166E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 121 events (found 923 events) + [COUNTERS] PROGRAM TOTAL : 0.8696s + [COUNTERS] Fortran Overhead ( 0 ) : 0.8376s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0321s for 8192 events => throughput is 2.56E+05 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (3.8703612510102356E-004) and cpp (3.8703612512203166E-004) differ by less than 2E-4 (5.427946980773868e-11) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttgg_x10_cudacpp > /tmp/avalassi/output_ggttgg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 64/64 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 2 + [XSECTION] Cross section = 0.0001579 [1.5793438642387717E-004] fbridge_mode=1 + [UNWEIGHT] Wrote 1361 events (found 1881 events) + [COUNTERS] PROGRAM TOTAL : 2.8559s + [COUNTERS] Fortran Overhead ( 0 ) : 2.5071s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3489s for 90112 events => throughput is 2.58E+05 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.5793438642451712E-004) and cpp (1.5793438642387717E-004) differ by less than 2E-4 (4.051980972974434e-12) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.289596e+05 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.528638e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.112086e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.149032e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.114551e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.167728e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.109912e+05 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.430504e+05 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 266d51ffaa..6ba33cd625 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=512y - -make USEBUILDDIR=1 AVX=512z +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make USEBUILDDIR=1 AVX=512y + +make USEBUILDDIR=1 AVX=512z +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-03-01_19:47:22 +DATE: 2024-03-01_03:53:01 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 84.8630s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3251s - [COUNTERS] Fortran MEs ( 1 ) : 84.5379s for 8192 events => throughput is 9.69E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.0689s + [COUNTERS] Fortran Overhead ( 0 ) : 0.5400s + [COUNTERS] Fortran MEs ( 1 ) : 95.5289s for 8192 events => throughput is 8.58E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 84.8695s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3267s - [COUNTERS] Fortran MEs ( 1 ) : 84.5428s for 8192 events => throughput is 9.69E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.2818s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4823s + [COUNTERS] Fortran MEs ( 1 ) : 95.7994s for 8192 events => throughput is 8.55E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 936.4463s - [COUNTERS] Fortran Overhead ( 0 ) : 2.8572s - [COUNTERS] Fortran MEs ( 1 ) : 933.5891s for 90112 events => throughput is 9.65E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1058.3505s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1547s + [COUNTERS] Fortran MEs ( 1 ) : 1054.1958s for 90112 events => throughput is 8.55E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939193E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 177.9314s - [COUNTERS] Fortran Overhead ( 0 ) : 81.6750s - [COUNTERS] CudaCpp MEs ( 2 ) : 96.2564s for 8192 events => throughput is 8.51E+01 events/s + [COUNTERS] PROGRAM TOTAL : 212.3366s + [COUNTERS] Fortran Overhead ( 0 ) : 99.0477s + [COUNTERS] CudaCpp MEs ( 2 ) : 113.2889s for 8192 events => throughput is 7.23E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1143.0884s - [COUNTERS] Fortran Overhead ( 0 ) : 84.1796s - [COUNTERS] CudaCpp MEs ( 2 ) : 1058.9088s for 90112 events => throughput is 8.51E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1356.0370s + [COUNTERS] Fortran Overhead ( 0 ) : 104.1787s + [COUNTERS] CudaCpp MEs ( 2 ) : 1251.8583s for 90112 events => throughput is 7.20E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656014E-007 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.008863e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.154156e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.009325e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.197434e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939197E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 93.6752s - [COUNTERS] Fortran Overhead ( 0 ) : 43.4234s - [COUNTERS] CudaCpp MEs ( 2 ) : 50.2518s for 8192 events => throughput is 1.63E+02 events/s + [COUNTERS] PROGRAM TOTAL : 107.3498s + [COUNTERS] Fortran Overhead ( 0 ) : 49.5738s + [COUNTERS] CudaCpp MEs ( 2 ) : 57.7759s for 8192 events => throughput is 1.42E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086656017E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 596.3770s - [COUNTERS] Fortran Overhead ( 0 ) : 45.9446s - [COUNTERS] CudaCpp MEs ( 2 ) : 550.4324s for 90112 events => throughput is 1.64E+02 events/s + [COUNTERS] PROGRAM TOTAL : 690.9132s + [COUNTERS] Fortran Overhead ( 0 ) : 53.4647s + [COUNTERS] CudaCpp MEs ( 2 ) : 637.4485s for 90112 events => throughput is 1.41E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656017E-007 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.905031e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.672791e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.906475e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.670748e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 42.5980s - [COUNTERS] Fortran Overhead ( 0 ) : 19.5764s - [COUNTERS] CudaCpp MEs ( 2 ) : 23.0216s for 8192 events => throughput is 3.56E+02 events/s + [COUNTERS] PROGRAM TOTAL : 49.9431s + [COUNTERS] Fortran Overhead ( 0 ) : 23.2154s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.7277s for 8192 events => throughput is 3.06E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 276.4895s - [COUNTERS] Fortran Overhead ( 0 ) : 22.0996s - [COUNTERS] CudaCpp MEs ( 2 ) : 254.3899s for 90112 events => throughput is 3.54E+02 events/s + [COUNTERS] PROGRAM TOTAL : 318.2044s + [COUNTERS] Fortran Overhead ( 0 ) : 26.8024s + [COUNTERS] CudaCpp MEs ( 2 ) : 291.4019s for 90112 events => throughput is 3.09E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656014E-007 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.257001e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.618074e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.254807e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.618894e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 37.6513s - [COUNTERS] Fortran Overhead ( 0 ) : 17.0538s - [COUNTERS] CudaCpp MEs ( 2 ) : 20.5975s for 8192 events => throughput is 3.98E+02 events/s + [COUNTERS] PROGRAM TOTAL : 44.2064s + [COUNTERS] Fortran Overhead ( 0 ) : 20.3467s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.8597s for 8192 events => throughput is 3.43E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 246.2869s - [COUNTERS] Fortran Overhead ( 0 ) : 19.5567s - [COUNTERS] CudaCpp MEs ( 2 ) : 226.7302s for 90112 events => throughput is 3.97E+02 events/s + [COUNTERS] PROGRAM TOTAL : 291.1048s + [COUNTERS] Fortran Overhead ( 0 ) : 24.2318s + [COUNTERS] CudaCpp MEs ( 2 ) : 266.8729s for 90112 events => throughput is 3.38E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656014E-007 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.902541e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.097914e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.907357e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.125731e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939191E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 21.2887s - [COUNTERS] Fortran Overhead ( 0 ) : 9.9280s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.3607s for 8192 events => throughput is 7.21E+02 events/s + [COUNTERS] PROGRAM TOTAL : 45.8566s + [COUNTERS] Fortran Overhead ( 0 ) : 22.2857s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.5710s for 8192 events => throughput is 3.48E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086656014E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 136.9333s - [COUNTERS] Fortran Overhead ( 0 ) : 12.3995s - [COUNTERS] CudaCpp MEs ( 2 ) : 124.5338s for 90112 events => throughput is 7.24E+02 events/s + [COUNTERS] PROGRAM TOTAL : 285.1342s + [COUNTERS] Fortran Overhead ( 0 ) : 26.2120s + [COUNTERS] CudaCpp MEs ( 2 ) : 258.9222s for 90112 events => throughput is 3.48E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656014E-007 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.582235e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.725410e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.580960e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.772387e+02 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.24e-06 [1.2403985227939195E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 70 events (found 407 events) + [COUNTERS] PROGRAM TOTAL : 4.2510s + [COUNTERS] Fortran Overhead ( 0 ) : 3.1660s + [COUNTERS] CudaCpp MEs ( 2 ) : 1.0850s for 8192 events => throughput is 7.55E+03 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985227939195E-006) differ by less than 3E-14 (1.7763568394002505e-15) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.332e-07 [2.3322993086656006E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 303 events (found 1531 events) + [COUNTERS] PROGRAM TOTAL : 18.8198s + [COUNTERS] Fortran Overhead ( 0 ) : 6.9183s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.9015s for 90112 events => throughput is 7.57E+03 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993086656006E-007) differ by less than 3E-14 (1.7763568394002505e-15) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.527080e+03 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.239391e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.271267e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.600243e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.245889e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.476521e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.229131e+03 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.234312e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 0c152a6098..2b7ca2c190 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 - make USEBUILDDIR=1 AVX=512y + make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-03-01_20:57:42 +DATE: 2024-03-01_05:18:49 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 84.8831s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3259s - [COUNTERS] Fortran MEs ( 1 ) : 84.5572s for 8192 events => throughput is 9.69E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.8320s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4739s + [COUNTERS] Fortran MEs ( 1 ) : 96.3581s for 8192 events => throughput is 8.50E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 85.4196s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3272s - [COUNTERS] Fortran MEs ( 1 ) : 85.0923s for 8192 events => throughput is 9.63E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.1294s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4800s + [COUNTERS] Fortran MEs ( 1 ) : 95.6494s for 8192 events => throughput is 8.56E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 934.8129s - [COUNTERS] Fortran Overhead ( 0 ) : 2.8577s - [COUNTERS] Fortran MEs ( 1 ) : 931.9553s for 90112 events => throughput is 9.67E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1058.3011s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1783s + [COUNTERS] Fortran MEs ( 1 ) : 1054.1228s for 90112 events => throughput is 8.55E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,21 +126,21 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 1.241e-06 [1.2405719950940886E-006] fbridge_mode=1 + [XSECTION] Cross section = 1.241e-06 [1.2405719957040752E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 172.1966s - [COUNTERS] Fortran Overhead ( 0 ) : 79.1644s - [COUNTERS] CudaCpp MEs ( 2 ) : 93.0322s for 8192 events => throughput is 8.81E+01 events/s + [COUNTERS] PROGRAM TOTAL : 197.7089s + [COUNTERS] Fortran Overhead ( 0 ) : 90.3714s + [COUNTERS] CudaCpp MEs ( 2 ) : 107.3375s for 8192 events => throughput is 7.63E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405719950940886E-006) differ by less than 4E-4 (0.00013985206930144933) +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405719957040752E-006) differ by less than 4E-4 (0.00013985256106807675) *** (2-none) Compare MADEVENT_CPP x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** @@ -159,35 +159,35 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE [XSECTION] Configuration = 1 [XSECTION] ChannelId = 1 - [XSECTION] Cross section = 2.333e-07 [2.3326290797495657E-007] fbridge_mode=1 + [XSECTION] Cross section = 2.333e-07 [2.3326290771198648E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1105.6235s - [COUNTERS] Fortran Overhead ( 0 ) : 81.6815s - [COUNTERS] CudaCpp MEs ( 2 ) : 1023.9420s for 90112 events => throughput is 8.80E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1274.0074s + [COUNTERS] Fortran Overhead ( 0 ) : 94.0944s + [COUNTERS] CudaCpp MEs ( 2 ) : 1179.9131s for 90112 events => throughput is 7.64E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** -OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326290797495657E-007) differ by less than 4E-4 (0.0001413931234055532) +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326290771198648E-007) differ by less than 4E-4 (0.00014139199589124907) *** (2-none) Compare MADEVENT_CPP x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.041120e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.108865e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.040723e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.128078e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.241e-06 [1.2405717007921116E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 42.2598s - [COUNTERS] Fortran Overhead ( 0 ) : 19.7765s - [COUNTERS] CudaCpp MEs ( 2 ) : 22.4833s for 8192 events => throughput is 3.64E+02 events/s + [COUNTERS] PROGRAM TOTAL : 49.6519s + [COUNTERS] Fortran Overhead ( 0 ) : 23.3946s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.2573s for 8192 events => throughput is 3.12E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.333e-07 [2.3326284900828787E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 269.6254s - [COUNTERS] Fortran Overhead ( 0 ) : 22.2992s - [COUNTERS] CudaCpp MEs ( 2 ) : 247.3262s for 90112 events => throughput is 3.64E+02 events/s + [COUNTERS] PROGRAM TOTAL : 315.8806s + [COUNTERS] Fortran Overhead ( 0 ) : 27.1593s + [COUNTERS] CudaCpp MEs ( 2 ) : 288.7213s for 90112 events => throughput is 3.12E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326284900828787E-007 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.224551e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.581780e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.222429e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.565199e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.241e-06 [1.2405716659252656E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 21.5804s - [COUNTERS] Fortran Overhead ( 0 ) : 10.0211s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.5593s for 8192 events => throughput is 7.09E+02 events/s + [COUNTERS] PROGRAM TOTAL : 25.4788s + [COUNTERS] Fortran Overhead ( 0 ) : 11.8981s + [COUNTERS] CudaCpp MEs ( 2 ) : 13.5807s for 8192 events => throughput is 6.03E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.333e-07 [2.3326277036840957E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 139.7107s - [COUNTERS] Fortran Overhead ( 0 ) : 12.5414s - [COUNTERS] CudaCpp MEs ( 2 ) : 127.1693s for 90112 events => throughput is 7.09E+02 events/s + [COUNTERS] PROGRAM TOTAL : 165.7549s + [COUNTERS] Fortran Overhead ( 0 ) : 15.4780s + [COUNTERS] CudaCpp MEs ( 2 ) : 150.2769s for 90112 events => throughput is 6.00E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326277036840957E-007 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.465406e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.259920e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.459684e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.259066e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.241e-06 [1.2405716659252656E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 19.0276s - [COUNTERS] Fortran Overhead ( 0 ) : 8.7312s - [COUNTERS] CudaCpp MEs ( 2 ) : 10.2964s for 8192 events => throughput is 7.96E+02 events/s + [COUNTERS] PROGRAM TOTAL : 22.3180s + [COUNTERS] Fortran Overhead ( 0 ) : 10.3786s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.9393s for 8192 events => throughput is 6.86E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.333e-07 [2.3326277036840957E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 124.5091s - [COUNTERS] Fortran Overhead ( 0 ) : 11.2582s - [COUNTERS] CudaCpp MEs ( 2 ) : 113.2509s for 90112 events => throughput is 7.96E+02 events/s + [COUNTERS] PROGRAM TOTAL : 145.4310s + [COUNTERS] Fortran Overhead ( 0 ) : 14.1732s + [COUNTERS] CudaCpp MEs ( 2 ) : 131.2578s for 90112 events => throughput is 6.87E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326277036840957E-007 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.781035e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.296906e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.759033e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.301383e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.241e-06 [1.2405719306052570E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 10.7428s - [COUNTERS] Fortran Overhead ( 0 ) : 5.1100s - [COUNTERS] CudaCpp MEs ( 2 ) : 5.6329s for 8192 events => throughput is 1.45E+03 events/s + [COUNTERS] PROGRAM TOTAL : 23.0558s + [COUNTERS] Fortran Overhead ( 0 ) : 11.3644s + [COUNTERS] CudaCpp MEs ( 2 ) : 11.6914s for 8192 events => throughput is 7.01E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.333e-07 [2.3326283660088769E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 69.5332s - [COUNTERS] Fortran Overhead ( 0 ) : 7.6179s - [COUNTERS] CudaCpp MEs ( 2 ) : 61.9153s for 90112 events => throughput is 1.46E+03 events/s + [COUNTERS] PROGRAM TOTAL : 144.1559s + [COUNTERS] Fortran Overhead ( 0 ) : 15.2893s + [COUNTERS] CudaCpp MEs ( 2 ) : 128.8666s for 90112 events => throughput is 6.99E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326283660088769E-007 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.717335e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.554413e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.719770e+03 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 7.557969e+02 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.241e-06 [1.2405722175509512E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 70 events (found 407 events) + [COUNTERS] PROGRAM TOTAL : 2.4934s + [COUNTERS] Fortran Overhead ( 0 ) : 1.9950s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4985s for 8192 events => throughput is 1.64E+04 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2405722175509512E-006) differ by less than 4E-4 (0.00014003141235829908) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.333e-07 [2.3326296967941821E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 303 events (found 1531 events) + [COUNTERS] PROGRAM TOTAL : 11.1120s + [COUNTERS] Fortran Overhead ( 0 ) : 5.7089s + [COUNTERS] CudaCpp MEs ( 2 ) : 5.4031s for 90112 events => throughput is 1.67E+04 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3326296967941821E-007) differ by less than 4E-4 (0.0001416576883412901) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.650610e+04 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.632591e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.339184e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.373598e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.323596e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.361104e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.325481e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.425348e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 986ca889e8..99d7cfbcd5 100644 --- a/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg CUDACPP_BUILDDIR='.' + make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 + +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y make USEBUILDDIR=1 AVX=512z - - -make USEBUILDDIR=1 AVX=sse4 -make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' OMP_NUM_THREADS= -DATE: 2024-03-01_21:53:07 +DATE: 2024-03-01_06:24:34 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 1 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 85.2035s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3266s - [COUNTERS] Fortran MEs ( 1 ) : 84.8769s for 8192 events => throughput is 9.65E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.2156s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4799s + [COUNTERS] Fortran MEs ( 1 ) : 95.7357s for 8192 events => throughput is 8.56E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/avalassi/output_ggttggg_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x1_fortran > /tmp/a [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985227939174E-006] fbridge_mode=0 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 84.8113s - [COUNTERS] Fortran Overhead ( 0 ) : 0.3270s - [COUNTERS] Fortran MEs ( 1 ) : 84.4843s for 8192 events => throughput is 9.70E+01 events/s + [COUNTERS] PROGRAM TOTAL : 96.1318s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4799s + [COUNTERS] Fortran MEs ( 1 ) : 95.6519s for 8192 events => throughput is 8.56E+01 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/avalassi/output_ggttggg_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_ggttggg_x10_fortran > /tmp/ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993086655967E-007] fbridge_mode=0 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 937.5305s - [COUNTERS] Fortran Overhead ( 0 ) : 2.8561s - [COUNTERS] Fortran MEs ( 1 ) : 934.6744s for 90112 events => throughput is 9.64E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1057.5728s + [COUNTERS] Fortran Overhead ( 0 ) : 4.1537s + [COUNTERS] Fortran MEs ( 1 ) : 1053.4191s for 90112 events => throughput is 8.55E+01 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985299359844E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 180.7446s - [COUNTERS] Fortran Overhead ( 0 ) : 83.3239s - [COUNTERS] CudaCpp MEs ( 2 ) : 97.4208s for 8192 events => throughput is 8.41E+01 events/s + [COUNTERS] PROGRAM TOTAL : 220.4361s + [COUNTERS] Fortran Overhead ( 0 ) : 102.4490s + [COUNTERS] CudaCpp MEs ( 2 ) : 117.9870s for 8192 events => throughput is 6.94E+01 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993212353001E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 1158.9873s - [COUNTERS] Fortran Overhead ( 0 ) : 85.8624s - [COUNTERS] CudaCpp MEs ( 2 ) : 1073.1249s for 90112 events => throughput is 8.40E+01 events/s + [COUNTERS] PROGRAM TOTAL : 1422.8276s + [COUNTERS] Fortran Overhead ( 0 ) : 106.0198s + [COUNTERS] CudaCpp MEs ( 2 ) : 1316.8079s for 90112 events => throughput is 6.84E+01 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993212353001E-007 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.891814e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.035940e+01 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.885638e+01 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 8.018960e+01 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985295828471E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 90.4857s - [COUNTERS] Fortran Overhead ( 0 ) : 41.7615s - [COUNTERS] CudaCpp MEs ( 2 ) : 48.7242s for 8192 events => throughput is 1.68E+02 events/s + [COUNTERS] PROGRAM TOTAL : 110.5022s + [COUNTERS] Fortran Overhead ( 0 ) : 50.8167s + [COUNTERS] CudaCpp MEs ( 2 ) : 59.6855s for 8192 events => throughput is 1.37E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993222645653E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 580.2521s - [COUNTERS] Fortran Overhead ( 0 ) : 44.3181s - [COUNTERS] CudaCpp MEs ( 2 ) : 535.9340s for 90112 events => throughput is 1.68E+02 events/s + [COUNTERS] PROGRAM TOTAL : 715.3882s + [COUNTERS] Fortran Overhead ( 0 ) : 54.5501s + [COUNTERS] CudaCpp MEs ( 2 ) : 660.8381s for 90112 events => throughput is 1.36E+02 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993222645653E-007 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.980238e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.628879e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.979425e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.636164e+02 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 40.8990s - [COUNTERS] Fortran Overhead ( 0 ) : 18.5391s - [COUNTERS] CudaCpp MEs ( 2 ) : 22.3599s for 8192 events => throughput is 3.66E+02 events/s + [COUNTERS] PROGRAM TOTAL : 48.5744s + [COUNTERS] Fortran Overhead ( 0 ) : 22.1801s + [COUNTERS] CudaCpp MEs ( 2 ) : 26.3943s for 8192 events => throughput is 3.10E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 267.2159s - [COUNTERS] Fortran Overhead ( 0 ) : 21.0817s - [COUNTERS] CudaCpp MEs ( 2 ) : 246.1343s for 90112 events => throughput is 3.66E+02 events/s + [COUNTERS] PROGRAM TOTAL : 319.2663s + [COUNTERS] Fortran Overhead ( 0 ) : 26.0078s + [COUNTERS] CudaCpp MEs ( 2 ) : 293.2585s for 90112 events => throughput is 3.07E+02 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993222447204E-007 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.500304e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.764546e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.493152e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.773101e+02 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 36.1935s - [COUNTERS] Fortran Overhead ( 0 ) : 16.2965s - [COUNTERS] CudaCpp MEs ( 2 ) : 19.8969s for 8192 events => throughput is 4.12E+02 events/s + [COUNTERS] PROGRAM TOTAL : 42.4540s + [COUNTERS] Fortran Overhead ( 0 ) : 19.2743s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.1797s for 8192 events => throughput is 3.53E+02 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 237.7664s - [COUNTERS] Fortran Overhead ( 0 ) : 18.8191s - [COUNTERS] CudaCpp MEs ( 2 ) : 218.9473s for 90112 events => throughput is 4.12E+02 events/s + [COUNTERS] PROGRAM TOTAL : 277.3470s + [COUNTERS] Fortran Overhead ( 0 ) : 22.9193s + [COUNTERS] CudaCpp MEs ( 2 ) : 254.4277s for 90112 events => throughput is 3.54E+02 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993222447204E-007 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.129109e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.384820e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.122337e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.391539e+02 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 1.24e-06 [1.2403985293629285E-006] fbridge_mode=1 [UNWEIGHT] Wrote 70 events (found 407 events) - [COUNTERS] PROGRAM TOTAL : 20.7508s - [COUNTERS] Fortran Overhead ( 0 ) : 9.5720s - [COUNTERS] CudaCpp MEs ( 2 ) : 11.1788s for 8192 events => throughput is 7.33E+02 events/s + [COUNTERS] PROGRAM TOTAL : 45.2143s + [COUNTERS] Fortran Overhead ( 0 ) : 21.9553s + [COUNTERS] CudaCpp MEs ( 2 ) : 23.2589s for 8192 events => throughput is 3.52E+02 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 128/128 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_ggttggg_ [XSECTION] ChannelId = 1 [XSECTION] Cross section = 2.332e-07 [2.3322993222447204E-007] fbridge_mode=1 [UNWEIGHT] Wrote 303 events (found 1531 events) - [COUNTERS] PROGRAM TOTAL : 133.3425s - [COUNTERS] Fortran Overhead ( 0 ) : 12.0919s - [COUNTERS] CudaCpp MEs ( 2 ) : 121.2506s for 90112 events => throughput is 7.43E+02 events/s + [COUNTERS] PROGRAM TOTAL : 278.0679s + [COUNTERS] Fortran Overhead ( 0 ) : 25.4000s + [COUNTERS] CudaCpp MEs ( 2 ) : 252.6680s for 90112 events => throughput is 3.57E+02 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993222447204E-007 OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.875881e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.828727e+02 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.878334e+02 ) sec^-1 +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.858416e+02 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x1_cudacpp > /tmp/avalassi/output_ggttggg_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 1.24e-06 [1.2403985217419736E-006] fbridge_mode=1 + [UNWEIGHT] Wrote 70 events (found 407 events) + [COUNTERS] PROGRAM TOTAL : 3.5884s + [COUNTERS] Fortran Overhead ( 0 ) : 2.7239s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8645s for 8192 events => throughput is 9.48E+03 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (1.2403985227939174E-006) and cpp (1.2403985217419736E-006) differ by less than 2E-4 (8.480691704448873e-10) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_ggttggg_x10_cudacpp > /tmp/avalassi/output_ggttggg_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 128/128 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 2.332e-07 [2.3322993078576733E-007] fbridge_mode=1 + [UNWEIGHT] Wrote 303 events (found 1531 events) + [COUNTERS] PROGRAM TOTAL : 15.9902s + [COUNTERS] Fortran Overhead ( 0 ) : 6.4881s + [COUNTERS] CudaCpp MEs ( 2 ) : 9.5020s for 90112 events => throughput is 9.48E+03 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (2.3322993086655967E-007) and cpp (2.3322993078576733E-007) differ by less than 2E-4 (3.464063480507207e-10) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 9.411937e+03 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.083264e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.112113e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 512 32 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.161038e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.111465e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 128 128 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.105445e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 --bridge *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.112837e+04 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 2048 8 1 *** +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.656493e+03 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index b84371ad1d..8e9ad5ba7a 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=none +make USEBUILDDIR=1 AVX=sse4 +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512y -make USEBUILDDIR=1 AVX=sse4 -make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-03-01_19:46:46 +DATE: 2024-03-01_03:51:32 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.3129s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2585s - [COUNTERS] Fortran MEs ( 1 ) : 0.0543s for 8192 events => throughput is 1.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4944s + [COUNTERS] Fortran Overhead ( 0 ) : 0.4247s + [COUNTERS] Fortran MEs ( 1 ) : 0.0697s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2739s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2195s - [COUNTERS] Fortran MEs ( 1 ) : 0.0544s for 8192 events => throughput is 1.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3864s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3169s + [COUNTERS] Fortran MEs ( 1 ) : 0.0696s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.6236s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0286s - [COUNTERS] Fortran MEs ( 1 ) : 0.5950s for 90112 events => throughput is 1.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2522s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4897s + [COUNTERS] Fortran MEs ( 1 ) : 0.7625s for 90112 events => throughput is 1.18E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263335] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3360s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2765s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0595s for 8192 events => throughput is 1.38E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4681s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3922s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0759s for 8192 events => throughput is 1.08E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561293] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.7527s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0953s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6574s for 90112 events => throughput is 1.37E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3698s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5419s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8279s for 90112 events => throughput is 1.09E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561293) differ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.383515e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.084897e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.408034e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.103096e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351262530] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2799s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2477s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0322s for 8192 events => throughput is 2.54E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4004s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3592s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0412s for 8192 events => throughput is 1.99E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561281] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.4221s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0684s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3537s for 90112 events => throughput is 2.55E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9658s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5172s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4486s for 90112 events => throughput is 2.01E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561281) differ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.533732e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.019219e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.596028e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.018294e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2515s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2340s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0175s for 8192 events => throughput is 4.68E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3643s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3408s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0234s for 8192 events => throughput is 3.49E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.2454s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0534s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1920s for 90112 events => throughput is 4.69E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7585s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4994s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2591s for 90112 events => throughput is 3.48E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561295) differ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.786503e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.297018e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.821003e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.427747e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2476s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2309s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0167s for 8192 events => throughput is 4.91E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3848s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3623s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0225s for 8192 events => throughput is 3.65E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.2377s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0550s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1827s for 90112 events => throughput is 4.93E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8132s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5645s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2486s for 90112 events => throughput is 3.62E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561295) differ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.024302e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.905513e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.056314e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.866043e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263341] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2452s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2307s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0145s for 8192 events => throughput is 5.65E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3815s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3495s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0319s for 8192 events => throughput is 2.56E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_d_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.2125s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0534s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1591s for 90112 events => throughput is 5.66E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.8893s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5364s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3529s for 90112 events => throughput is 2.55E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561295) differ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.561724e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.640953e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.701897e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.543334e+05 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110539351263363] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.7465s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7458s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.18E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539351263363) differ by less than 3E-14 (1.3322676295501878e-15) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_d_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510686556561304] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 1.9068s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8993s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 90112 events => throughput is 1.20E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686556561304) differ by less than 3E-14 (4.440892098500626e-16) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.589846e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.058801e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.383441e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.512285e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.382616e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.771039e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.376307e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.776386e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index f8f26accf9..63166c80e0 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' -make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=512y - -make USEBUILDDIR=1 AVX=512z +make USEBUILDDIR=1 AVX=none make USEBUILDDIR=1 AVX=sse4 make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make USEBUILDDIR=1 AVX=512y + +make USEBUILDDIR=1 AVX=512z +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' -CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' +CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-03-01_19:46:58 +DATE: 2024-03-01_03:52:02 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.3136s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2586s - [COUNTERS] Fortran MEs ( 1 ) : 0.0550s for 8192 events => throughput is 1.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4536s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3837s + [COUNTERS] Fortran MEs ( 1 ) : 0.0700s for 8192 events => throughput is 1.17E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2674s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2131s - [COUNTERS] Fortran MEs ( 1 ) : 0.0544s for 8192 events => throughput is 1.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3907s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3210s + [COUNTERS] Fortran MEs ( 1 ) : 0.0696s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.6288s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0330s - [COUNTERS] Fortran MEs ( 1 ) : 0.5958s for 90112 events => throughput is 1.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2714s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5019s + [COUNTERS] Fortran MEs ( 1 ) : 0.7695s for 90112 events => throughput is 1.17E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110463093540638] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3315s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2765s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0550s for 8192 events => throughput is 1.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4586s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3882s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0704s for 8192 events => throughput is 1.16E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686273216112] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.6971s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0917s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6054s for 90112 events => throughput is 1.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.3150s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5373s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.7777s for 90112 events => throughput is 1.16E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686273216112) differ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.519656e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.170698e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.508595e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.161745e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110459152958460] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2550s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2350s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0200s for 8192 events => throughput is 4.10E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3657s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3405s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0252s for 8192 events => throughput is 3.25E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510683016166510] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.2758s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0569s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.2188s for 90112 events => throughput is 4.12E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7697s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4943s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2753s for 90112 events => throughput is 3.27E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510683016166510) differ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.011797e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.219045e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.085942e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.229652e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110460595003461] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2343s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2249s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0095s for 8192 events => throughput is 8.66E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3421s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3299s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0122s for 8192 events => throughput is 6.71E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510682502089912] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.1488s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0448s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1040s for 90112 events => throughput is 8.67E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6208s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4844s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1364s for 90112 events => throughput is 6.61E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510682502089912) differ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.884332e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.431027e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 8.982873e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.412727e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110460595003461] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2332s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2243s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0090s for 8192 events => throughput is 9.13E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3415s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3300s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0116s for 8192 events => throughput is 7.08E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510682502089912] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.1470s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0443s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1027s for 90112 events => throughput is 8.77E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.6084s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4831s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1253s for 90112 events => throughput is 7.19E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510682502089912) differ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.431807e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.891581e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 9.514860e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.928440e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110464176080312] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2312s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2239s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0073s for 8192 events => throughput is 1.12E+06 events/s + [COUNTERS] PROGRAM TOTAL : 0.3483s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3329s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0154s for 8192 events => throughput is 5.33E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_f_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510685411522326] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.1246s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0440s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0806s for 90112 events => throughput is 1.12E+06 events/s + [COUNTERS] PROGRAM TOTAL : 1.6561s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4840s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.1721s for 90112 events => throughput is 5.24E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510685411522326) differ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.130278e+06 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.988554e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.145791e+06 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.962392e+05 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110478167944563] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.7423s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7418s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0005s for 8192 events => throughput is 1.58E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110478167944563) differ by less than 4E-4 (2.2568093527297606e-06) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_f_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510689885789414] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 1.8968s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8910s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0059s for 90112 events => throughput is 1.53E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510689885789414) differ by less than 4E-4 (1.547708907700951e-07) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.824058e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.473484e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.891145e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.706092e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.798334e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.787777e+08 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.356687e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.028611e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index a0472a3076..eb4ca92d13 100644 --- a/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tmad/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,42 +1,42 @@ -Working directory (build): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +Working directory (build): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu CUDACPP_BUILDDIR='.' make USEBUILDDIR=1 AVX=none -make USEBUILDDIR=1 AVX=512y - +make USEBUILDDIR=1 AVX=avx2 make USEBUILDDIR=1 AVX=sse4 -make USEBUILDDIR=1 AVX=avx2 +make USEBUILDDIR=1 AVX=512y + make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' -CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' -CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' +CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu' OMP_NUM_THREADS= -DATE: 2024-03-01_19:47:09 +DATE: 2024-03-01_03:52:30 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: -Working directory (run): /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: +Working directory (run): /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gu_ttxu *** (1) EXECUTE MADEVENT_FORTRAN (create results.dat) *** -------------------- @@ -51,7 +51,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -59,9 +59,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1817 events) - [COUNTERS] PROGRAM TOTAL : 0.3127s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2578s - [COUNTERS] Fortran MEs ( 1 ) : 0.0549s for 8192 events => throughput is 1.49E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4522s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3828s + [COUNTERS] Fortran MEs ( 1 ) : 0.0694s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x1 (create events.lhe) *** -------------------- @@ -76,7 +76,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/avalassi/output_gqttq_x1_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -84,9 +84,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x1_fortran > /tmp/ava [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539351263330] fbridge_mode=0 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2697s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2153s - [COUNTERS] Fortran MEs ( 1 ) : 0.0544s for 8192 events => throughput is 1.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3858s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3164s + [COUNTERS] Fortran MEs ( 1 ) : 0.0694s for 8192 events => throughput is 1.18E+05 events/s *** (1) EXECUTE MADEVENT_FORTRAN x10 (create events.lhe) *** -------------------- @@ -101,7 +101,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/avalassi/output_gqttq_x10_fortran' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -109,9 +109,9 @@ Executing ' ./madevent_fortran < /tmp/avalassi/input_gqttq_x10_fortran > /tmp/av [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686556561295] fbridge_mode=0 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.6293s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0334s - [COUNTERS] Fortran MEs ( 1 ) : 0.5959s for 90112 events => throughput is 1.51E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.2499s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4895s + [COUNTERS] Fortran MEs ( 1 ) : 0.7604s for 90112 events => throughput is 1.19E+05 events/s *** (2-none) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -126,7 +126,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -134,9 +134,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539348916002] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.3346s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2749s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0596s for 8192 events => throughput is 1.37E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4694s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3943s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0751s for 8192 events => throughput is 1.09E+05 events/s *** (2-none) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -159,7 +159,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -167,9 +167,9 @@ Executing ' ./build.none_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686560794337] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.7529s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0996s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.6533s for 90112 events => throughput is 1.38E+05 events/s + [COUNTERS] PROGRAM TOTAL : 2.4601s + [COUNTERS] Fortran Overhead ( 0 ) : 1.6015s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.8586s for 90112 events => throughput is 1.05E+05 events/s *** (2-none) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -180,14 +180,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686560794337) differ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.399420e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.100770e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 1.416273e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.090853e+05 ) sec^-1 *** (2-sse4) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -202,7 +202,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -210,9 +210,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539348916002] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2792s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2479s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0314s for 8192 events => throughput is 2.61E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3940s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3546s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0394s for 8192 events => throughput is 2.08E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -235,7 +235,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -243,9 +243,9 @@ Executing ' ./build.sse4_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686560794334] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.4119s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0679s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.3440s for 90112 events => throughput is 2.62E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9359s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5057s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.4302s for 90112 events => throughput is 2.09E+05 events/s *** (2-sse4) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -256,14 +256,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686560794334) differ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.596110e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.020468e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 2.619319e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.027641e+05 ) sec^-1 *** (2-avx2) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -278,7 +278,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -286,9 +286,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2501s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2330s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0171s for 8192 events => throughput is 4.78E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3636s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3406s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0230s for 8192 events => throughput is 3.56E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -311,7 +311,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -319,9 +319,9 @@ Executing ' ./build.avx2_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686558551748] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.2416s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0536s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1880s for 90112 events => throughput is 4.79E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7468s + [COUNTERS] Fortran Overhead ( 0 ) : 1.4923s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2545s for 90112 events => throughput is 3.54E+05 events/s *** (2-avx2) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -332,14 +332,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686558551748) differ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.893459e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.536848e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 4.921901e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.536744e+05 ) sec^-1 *** (2-512y) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -354,7 +354,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -362,9 +362,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2482s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2320s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0162s for 8192 events => throughput is 5.06E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.3573s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3372s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0201s for 8192 events => throughput is 4.08E+05 events/s *** (2-512y) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -387,7 +387,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -395,9 +395,9 @@ Executing ' ./build.512y_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686558551748] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.2303s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0527s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1776s for 90112 events => throughput is 5.07E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.7304s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5047s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.2257s for 90112 events => throughput is 3.99E+05 events/s *** (2-512y) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -408,14 +408,14 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686558551748) differ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.182586e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.887668e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.219883e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 3.834847e+05 ) sec^-1 *** (2-512z) EXECUTE MADEVENT_CPP x1 (create events.lhe) *** -------------------- @@ -430,7 +430,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -438,9 +438,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2711 [0.27110539330272815] fbridge_mode=1 [UNWEIGHT] Wrote 404 events (found 1228 events) - [COUNTERS] PROGRAM TOTAL : 0.2489s - [COUNTERS] Fortran Overhead ( 0 ) : 0.2335s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.0154s for 8192 events => throughput is 5.34E+05 events/s + [COUNTERS] PROGRAM TOTAL : 0.4046s + [COUNTERS] Fortran Overhead ( 0 ) : 0.3689s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0357s for 8192 events => throughput is 2.30E+05 events/s *** (2-512z) Compare MADEVENT_CPP x1 xsec to MADEVENT_FORTRAN xsec *** @@ -463,7 +463,7 @@ CUDACPP_RUNTIME_VECSIZEUSED = 8192 1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) -------------------- Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' - [OPENMPTH] omp_get_max_threads/nproc = 1/32 + [OPENMPTH] omp_get_max_threads/nproc = 1/4 [NGOODHEL] ngoodhel/ncomb = 16/32 [XSECTION] VECSIZE_USED = 8192 [XSECTION] MultiChannel = TRUE @@ -471,9 +471,9 @@ Executing ' ./build.512z_m_inl0_hrd0/madevent_cpp < /tmp/avalassi/input_gqttq_x1 [XSECTION] ChannelId = 1 [XSECTION] Cross section = 0.2151 [0.21510686558551748] fbridge_mode=1 [UNWEIGHT] Wrote 1939 events (found 1944 events) - [COUNTERS] PROGRAM TOTAL : 1.2187s - [COUNTERS] Fortran Overhead ( 0 ) : 1.0503s - [COUNTERS] CudaCpp MEs ( 2 ) : 0.1684s for 90112 events => throughput is 5.35E+05 events/s + [COUNTERS] PROGRAM TOTAL : 1.9542s + [COUNTERS] Fortran Overhead ( 0 ) : 1.5763s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.3779s for 90112 events => throughput is 2.38E+05 events/s *** (2-512z) Compare MADEVENT_CPP x10 xsec to MADEVENT_FORTRAN xsec *** @@ -484,13 +484,119 @@ OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686558551748) differ OK! events.lhe.cpp.10 and events.lhe.ref.10 are identical *** EXECUTE CHECK(8192) -p 256 32 1 --bridge *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.247590e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.510568e+05 ) sec^-1 *** EXECUTE CHECK(8192) -p 256 32 1 *** -Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK -EvtsPerSec[MECalcOnly] (3a) = ( 5.440729e+05 ) sec^-1 +Process = SIGMA_SM_GU_TTXU_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.320811e+05 ) sec^-1 + +*** (3) EXECUTE MADEVENT_CUDA x1 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +8192 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x1_cudacpp > /tmp/avalassi/output_gqttq_x1_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2711 [0.27110539343558537] fbridge_mode=1 + [UNWEIGHT] Wrote 404 events (found 1228 events) + [COUNTERS] PROGRAM TOTAL : 0.7473s + [COUNTERS] Fortran Overhead ( 0 ) : 0.7466s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0007s for 8192 events => throughput is 1.20E+07 events/s + +*** (3) Compare MADEVENT_CUDA x1 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.27110539351263330) and cpp (0.27110539343558537) differ by less than 2E-4 (2.8419910869104115e-10) + +*** (3) Compare MADEVENT_CUDA x1 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.1 and events.lhe.ref.1 are identical + +*** (3) EXECUTE MADEVENT_CUDA x10 (create events.lhe) *** +-------------------- +CUDACPP_RUNTIME_FBRIDGEMODE = (not set) +CUDACPP_RUNTIME_VECSIZEUSED = 8192 +-------------------- +81920 1 1 ! Number of events and max and min iterations +0.000001 ! Accuracy (ignored because max iterations = min iterations) +0 ! Grid Adjustment 0=none, 2=adjust (NB if = 0, ftn26 will still be used if present) +1 ! Suppress Amplitude 1=yes (i.e. use MadEvent single-diagram enhancement) +0 ! Helicity Sum/event 0=exact +1 ! Channel number (1-N) for single-diagram enhancement multi-channel (NB used even if suppress amplitude is 0!) +-------------------- +Executing ' ./build.none_m_inl0_hrd0/madevent_cuda < /tmp/avalassi/input_gqttq_x10_cudacpp > /tmp/avalassi/output_gqttq_x10_cudacpp' + [OPENMPTH] omp_get_max_threads/nproc = 1/4 + [NGOODHEL] ngoodhel/ncomb = 16/32 + [XSECTION] VECSIZE_USED = 8192 + [XSECTION] MultiChannel = TRUE + [XSECTION] Configuration = 1 + [XSECTION] ChannelId = 1 + [XSECTION] Cross section = 0.2151 [0.21510686553631395] fbridge_mode=1 + [UNWEIGHT] Wrote 1939 events (found 1944 events) + [COUNTERS] PROGRAM TOTAL : 1.8944s + [COUNTERS] Fortran Overhead ( 0 ) : 1.8868s + [COUNTERS] CudaCpp MEs ( 2 ) : 0.0075s for 90112 events => throughput is 1.19E+07 events/s + +*** (3) Compare MADEVENT_CUDA x10 xsec to MADEVENT_FORTRAN xsec *** + +OK! xsec from fortran (0.21510686556561295) and cpp (0.21510686553631395) differ by less than 2E-4 (1.3620671257541517e-10) + +*** (3) Compare MADEVENT_CUDA x10 events.lhe to MADEVENT_FORTRAN events.lhe reference (including colors and helicities) *** + +OK! events.lhe.cuda.10 and events.lhe.ref.10 are identical + +*** EXECUTE GCHECK(8192) -p 256 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.579519e+07 ) sec^-1 + +*** EXECUTE GCHECK(8192) -p 256 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 4.134868e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.391789e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX) -p 16384 32 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.511629e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.394001e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX128THR) -p 4096 128 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 6.800973e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 --bridge *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 2.396936e+07 ) sec^-1 + +*** EXECUTE GCHECK(MAX8THR) -p 65536 8 1 *** +Process = SIGMA_SM_GU_TTXU_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +EvtsPerSec[MECalcOnly] (3a) = ( 1.776316e+07 ) sec^-1 TEST COMPLETED diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt index ed28df1cad..baa8c044cd 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:08:08 +DATE: 2024-03-01_02:23:52 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.465816e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.330908e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.240172e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.907657 sec + 2,864,594,511 cycles # 3.017 GHz + 4,419,491,827 instructions # 1.54 insn per cycle + 1.243823060 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282804e-02 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.425993e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.645967e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.645967e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.035526 sec - 17,606,144,392 cycles # 3.495 GHz - 44,071,163,755 instructions # 2.50 insn per cycle - 5.038823878 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.117981e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.310106e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.310106e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.029383 sec + 18,345,746,310 cycles # 3.041 GHz + 43,971,705,846 instructions # 2.40 insn per cycle + 6.038464488 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.250731e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.893394e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.893394e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.401156 sec - 11,895,409,777 cycles # 3.495 GHz - 30,996,596,899 instructions # 2.61 insn per cycle - 3.404468118 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.673850e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.186329e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.186329e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.154865 sec + 12,823,382,487 cycles # 3.082 GHz + 30,998,172,347 instructions # 2.42 insn per cycle + 4.171623433 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.710645e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.676510e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.676510e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.897930 sec - 9,801,226,041 cycles # 3.380 GHz - 19,263,940,378 instructions # 1.97 insn per cycle - 2.901252745 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.086690e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.914110e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.914110e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.406763 sec + 10,081,289,557 cycles # 2.955 GHz + 19,366,111,959 instructions # 1.92 insn per cycle + 3.427414790 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.787050e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.808409e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.808409e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.849886 sec - 9,645,321,314 cycles # 3.382 GHz - 18,674,064,614 instructions # 1.94 insn per cycle - 2.853150797 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.191873e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.083636e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.083636e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.257696 sec + 9,685,682,355 cycles # 2.968 GHz + 18,976,171,527 instructions # 1.96 insn per cycle + 3.273948471 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.516175e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.268017e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.268017e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.378272 sec - 8,078,122,389 cycles # 3.393 GHz - 15,420,673,885 instructions # 1.91 insn per cycle - 2.381601605 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.805262e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.408203e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.408203e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.888242 sec + 8,621,851,062 cycles # 2.214 GHz + 15,727,334,662 instructions # 1.82 insn per cycle + 3.905958468 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt index 062460fe0a..b9ff72dbf3 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_bridge.txt @@ -1,188 +1,219 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:28:58 +DATE: 2024-03-01_03:12:58 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.687342e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.551417e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.551417e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 2.232505 sec + 7,524,955,995 cycles # 3.041 GHz + 13,468,669,108 instructions # 1.79 insn per cycle + 2.532807464 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282804e-02 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.384713e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.591202e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.591202e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.251037 sec - 18,383,972,346 cycles # 3.499 GHz - 44,316,778,142 instructions # 2.41 insn per cycle - 5.255195321 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.081573e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.260544e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.260544e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.415532 sec + 19,561,606,037 cycles # 3.046 GHz + 44,198,639,919 instructions # 2.26 insn per cycle + 6.422457347 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.136391e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.710309e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.710309e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.710222 sec - 12,989,118,477 cycles # 3.498 GHz - 31,817,590,346 instructions # 2.45 insn per cycle - 3.714348648 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.552230e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.996603e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.996603e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.664054 sec + 13,997,557,946 cycles # 2.998 GHz + 31,841,279,233 instructions # 2.27 insn per cycle + 4.670791737 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.564211e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.413599e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.413599e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.183000 sec - 10,807,008,900 cycles # 3.392 GHz - 20,602,119,150 instructions # 1.91 insn per cycle - 3.187089748 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.951455e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.660973e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.660973e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.823801 sec + 11,324,833,068 cycles # 2.957 GHz + 20,724,775,427 instructions # 1.83 insn per cycle + 3.830534322 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.621053e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.507982e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.507982e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.143575 sec - 10,677,287,374 cycles # 3.394 GHz - 20,011,376,117 instructions # 1.87 insn per cycle - 3.147703877 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.028218e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.792747e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.792747e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.704930 sec + 10,963,593,820 cycles # 2.954 GHz + 20,347,072,159 instructions # 1.86 insn per cycle + 3.711957869 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.247983e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.684393e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.684393e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.688755 sec - 9,174,309,622 cycles # 3.408 GHz - 16,543,784,332 instructions # 1.80 insn per cycle - 2.692848189 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.747913e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.283053e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.283053e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.214412 sec + 9,956,996,891 cycles # 2.360 GHz + 16,873,658,319 instructions # 1.69 insn per cycle + 4.221168968 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt index bec395b2fc..09aaad1dd8 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_common.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:34:42 +DATE: 2024-03-01_03:26:09 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 6.492636e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.583078e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.097014e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 +TOTAL : 1.329039 sec + 4,626,136,964 cycles # 2.966 GHz + 7,229,705,832 instructions # 1.56 insn per cycle + 1.616136536 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282804e-02 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.400069e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.612023e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.612023e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.120496e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.314160e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.314160e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.119023 sec - 17,915,758,115 cycles # 3.498 GHz - 44,072,070,040 instructions # 2.46 insn per cycle - 5.122383130 seconds time elapsed +TOTAL : 6.368910 sec + 19,436,039,687 cycles # 3.050 GHz + 44,075,637,403 instructions # 2.27 insn per cycle + 6.374367735 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.251714e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.897562e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.897562e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.684337e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.204179e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.204179e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.399887 sec - 11,900,907,245 cycles # 3.498 GHz - 30,996,702,046 instructions # 2.60 insn per cycle - 3.403003584 seconds time elapsed +TOTAL : 4.477126 sec + 13,840,650,655 cycles # 3.088 GHz + 31,000,398,658 instructions # 2.24 insn per cycle + 4.482579907 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.718441e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.683164e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.683164e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.074274e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.910197e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.910197e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.889969 sec - 9,778,075,616 cycles # 3.381 GHz - 19,263,677,928 instructions # 1.97 insn per cycle - 2.893107114 seconds time elapsed +TOTAL : 3.779571 sec + 11,221,356,305 cycles # 2.967 GHz + 19,268,573,834 instructions # 1.72 insn per cycle + 3.784933241 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.796994e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.822794e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.822794e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.174998e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.082449e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.082449e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.839931 sec - 9,615,587,176 cycles # 3.383 GHz - 18,673,582,472 instructions # 1.94 insn per cycle - 2.843100077 seconds time elapsed +TOTAL : 3.643336 sec + 10,818,026,445 cycles # 2.966 GHz + 18,676,470,141 instructions # 1.73 insn per cycle + 3.648853496 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.505324e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.277259e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.277259e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.875863e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.507498e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.507498e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.384419 sec - 8,111,960,646 cycles # 3.399 GHz - 15,421,322,877 instructions # 1.90 insn per cycle - 2.387622919 seconds time elapsed +TOTAL : 4.111357 sec + 9,725,602,646 cycles # 2.364 GHz + 15,429,502,829 instructions # 1.59 insn per cycle + 4.116843302 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt index 9bceb91dab..c5fdf6f3c6 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_curhst.txt @@ -1,138 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:34:05 +DATE: 2024-03-01_03:22:56 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 6.511929e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.606834e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.132028e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.970663 sec + 3,681,129,197 cycles # 3.043 GHz + 7,185,953,404 instructions # 1.95 insn per cycle + 1.266725293 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282804e-02 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe: Aborted - 4,565,469 cycles # 3.250 GHz - 6,288,877 instructions # 1.38 insn per cycle - 0.042167323 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.129015e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.325606e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.325606e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 5.971935 sec + 18,327,370,852 cycles # 3.067 GHz + 43,971,442,751 instructions # 2.40 insn per cycle + 5.977352348 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe: Aborted - 4,579,291 cycles # 2.729 GHz - 6,314,135 instructions # 1.38 insn per cycle - 0.039024734 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.658250e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.168305e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.168305e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.196892 sec + 12,732,971,160 cycles # 3.031 GHz + 30,998,026,084 instructions # 2.43 insn per cycle + 4.202372987 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe: Aborted - 4,579,990 cycles # 2.736 GHz - 6,320,509 instructions # 1.38 insn per cycle - 0.038108900 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.058430e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.883101e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.883101e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.446922 sec + 10,145,804,321 cycles # 2.940 GHz + 19,366,948,979 instructions # 1.91 insn per cycle + 3.452452971 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe: Aborted - 4,556,853 cycles # 2.681 GHz - 6,314,123 instructions # 1.39 insn per cycle - 0.038394657 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.138596e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.023243e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.023243e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.339653 sec + 9,693,126,342 cycles # 2.898 GHz + 18,976,550,822 instructions # 1.96 insn per cycle + 3.345442131 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe: Aborted - 4,605,984 cycles # 2.673 GHz - 6,322,733 instructions # 1.37 insn per cycle - 0.038325628 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.879529e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.506982e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.506982e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.741561 sec + 8,595,853,951 cycles # 2.295 GHz + 15,727,211,339 instructions # 1.83 insn per cycle + 3.747065146 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt index dd5c05e2b0..4a4acadae4 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd0_rmbhst.txt @@ -1,183 +1,208 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:32:16 +DATE: 2024-03-01_03:19:38 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 6.223584e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.552038e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.038459e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 1.841184 sec + 6,281,268,865 cycles # 3.032 GHz + 11,616,541,551 instructions # 1.85 insn per cycle + 2.127335919 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282804e-02 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.409868e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.625204e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.625204e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.085348 sec - 17,784,058,163 cycles # 3.496 GHz - 44,072,184,676 instructions # 2.48 insn per cycle - 5.088440281 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.136861e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.332827e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.332827e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 5.931254 sec + 18,320,874,631 cycles # 3.087 GHz + 43,971,483,251 instructions # 2.40 insn per cycle + 5.936943481 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 433) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.251897e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.896740e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.896740e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.399875 sec - 11,894,015,527 cycles # 3.496 GHz - 30,996,681,554 instructions # 2.61 insn per cycle - 3.403119350 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.678735e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.191487e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.191487e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.142725 sec + 12,747,370,194 cycles # 3.074 GHz + 30,997,666,885 instructions # 2.43 insn per cycle + 4.148307465 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1644) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.713038e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.676668e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.676668e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.893827 sec - 9,788,402,080 cycles # 3.380 GHz - 19,263,518,574 instructions # 1.97 insn per cycle - 2.897020495 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.080045e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.910176e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.910176e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.411600 sec + 10,085,079,136 cycles # 2.953 GHz + 19,364,558,625 instructions # 1.92 insn per cycle + 3.417084709 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1966) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.802736e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.835064e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.835064e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.835699 sec - 9,598,762,677 cycles # 3.382 GHz - 18,673,739,851 instructions # 1.95 insn per cycle - 2.838932444 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.138969e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.032835e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.032835e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.338836 sec + 9,731,023,917 cycles # 2.911 GHz + 18,988,816,377 instructions # 1.95 insn per cycle + 3.344328310 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1690) (512y: 181) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.529044e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.287257e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.287257e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.372119 sec - 8,060,639,797 cycles # 3.395 GHz - 15,420,889,940 instructions # 1.91 insn per cycle - 2.375266541 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.865281e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.489559e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.489559e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.766791 sec + 8,586,243,314 cycles # 2.277 GHz + 15,726,194,960 instructions # 1.83 insn per cycle + 3.772300478 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 901) (512y: 154) (512z: 1258) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt index c765405708..acaec4a100 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl0_hrd1.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:08:29 +DATE: 2024-03-01_02:24:28 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.477749e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.322801e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.215924e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.699180 sec + 2,815,032,547 cycles # 3.020 GHz + 4,411,732,319 instructions # 1.57 insn per cycle + 1.012826906 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282804e-02 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.526914e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.781748e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.781748e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.743355 sec - 16,588,877,180 cycles # 3.495 GHz - 41,918,002,874 instructions # 2.53 insn per cycle - 4.746678875 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.177941e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.396494e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.396494e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 5.744811 sec + 17,454,360,700 cycles # 3.039 GHz + 41,822,159,126 instructions # 2.40 insn per cycle + 5.754685240 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 392) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.291295e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.968808e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.968808e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.352770 sec - 11,735,836,966 cycles # 3.498 GHz - 30,158,811,446 instructions # 2.57 insn per cycle - 3.356116759 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.724349e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.269291e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.269291e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.046627 sec + 12,493,235,601 cycles # 3.083 GHz + 30,160,547,265 instructions # 2.41 insn per cycle + 4.067076512 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1612) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.762119e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.774937e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.774937e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.859880 sec - 9,680,972,149 cycles # 3.382 GHz - 18,995,343,677 instructions # 1.96 insn per cycle - 2.863281821 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.121345e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.968992e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.968992e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.357760 sec + 9,927,136,910 cycles # 2.952 GHz + 19,096,793,241 instructions # 1.92 insn per cycle + 3.375474470 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1931) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.833992e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.898752e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.898752e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.816063 sec - 9,537,877,818 cycles # 3.384 GHz - 18,442,918,066 instructions # 1.93 insn per cycle - 2.819507820 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.204942e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.126738e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.126738e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.243150 sec + 9,616,213,299 cycles # 2.960 GHz + 18,757,748,925 instructions # 1.95 insn per cycle + 3.265371118 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1662) (512y: 178) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165090E-002 Relative difference = 1.0277089176796747e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.539315e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.305642e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.305642e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.365877 sec - 8,037,965,784 cycles # 3.394 GHz - 15,297,089,513 instructions # 1.90 insn per cycle - 2.369192143 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.914682e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.579340e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.579340e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.680994 sec + 8,464,459,891 cycles # 2.296 GHz + 15,603,182,673 instructions # 1.84 insn per cycle + 3.700542167 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 887) (512y: 156) (512z: 1239) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt index a63eab528c..5e36a6ad1c 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd0.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:22:33 +DATE: 2024-03-01_03:02:07 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 6.482201e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.589772e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.144008e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.677531 sec + 2,738,360,567 cycles # 3.010 GHz + 4,202,554,319 instructions # 1.53 insn per cycle + 0.971727419 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282804e-02 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.166985e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.720632e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.720632e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.530132 sec - 12,355,611,055 cycles # 3.498 GHz - 32,613,744,062 instructions # 2.64 insn per cycle - 3.533621370 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.697362e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.176157e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.176157e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.107132 sec + 12,669,493,888 cycles # 3.081 GHz + 32,513,570,576 instructions # 2.57 insn per cycle + 4.112837024 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 296) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.915578e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.095121e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.095121e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.764375 sec - 9,674,764,507 cycles # 3.497 GHz - 24,473,407,704 instructions # 2.53 insn per cycle - 2.767795138 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.109105e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.012747e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.012747e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.385880 sec + 10,259,128,837 cycles # 3.025 GHz + 24,473,597,991 instructions # 2.39 insn per cycle + 3.391687112 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1251) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.020535e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.265971e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.265971e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.659102 sec - 9,008,630,968 cycles # 3.385 GHz - 16,821,530,421 instructions # 1.87 insn per cycle - 2.662558205 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.263099e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.319180e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.319180e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.179158 sec + 9,139,183,085 cycles # 2.870 GHz + 16,922,980,195 instructions # 1.85 insn per cycle + 3.185130704 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1631) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.305206e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.841692e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.841692e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.493754 sec - 8,724,117,803 cycles # 3.495 GHz - 16,041,038,377 instructions # 1.84 insn per cycle - 2.497150137 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.177097e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.324804e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.324804e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.299126 sec + 9,225,486,663 cycles # 2.804 GHz + 16,350,529,622 instructions # 1.77 insn per cycle + 3.305119215 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1370) (512y: 139) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.721147e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.731450e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.731450e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.278737 sec - 7,752,022,969 cycles # 3.398 GHz - 14,276,736,178 instructions # 1.84 insn per cycle - 2.282116900 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.061533e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.856351e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.856351e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.449960 sec + 7,914,148,444 cycles # 2.292 GHz + 14,582,993,732 instructions # 1.84 insn per cycle + 3.455623027 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1015) (512y: 158) (512z: 955) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt index 1680b26f09..640cde8efe 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_d_inl1_hrd1.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:22:52 +DATE: 2024-03-01_03:02:37 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 6.480008e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.624168e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.202092e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.676373 sec + 2,668,503,996 cycles # 2.929 GHz + 4,153,523,497 instructions # 1.56 insn per cycle + 0.971892133 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282804e-02 +Avg ME (F77/CUDA) = 1.2828039868165201E-002 +Relative difference = 1.0277080522138477e-08 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.961846e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.105876e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.105876e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.753891 sec - 9,636,283,642 cycles # 3.496 GHz - 25,494,464,469 instructions # 2.65 insn per cycle - 2.757264718 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.254295e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.186891e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.186891e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.188433 sec + 9,833,021,244 cycles # 3.080 GHz + 25,393,539,961 instructions # 2.58 insn per cycle + 3.194101979 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 249) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.533991e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.440258e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.440258e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.386937 sec - 8,350,886,622 cycles # 3.495 GHz - 21,482,443,122 instructions # 2.57 insn per cycle - 2.390265855 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.515638e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.869932e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.869932e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 2.899703 sec + 8,920,893,128 cycles # 3.072 GHz + 21,482,466,118 instructions # 2.41 insn per cycle + 2.905533602 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1119) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868164916E-002 Relative difference = 1.0277102699700292e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.414671e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.103227e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.103227e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.417911 sec - 8,459,925,330 cycles # 3.495 GHz - 15,709,703,737 instructions # 1.86 insn per cycle - 2.421288962 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.523191e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.858970e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.858970e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 2.882396 sec + 8,595,793,495 cycles # 2.978 GHz + 15,810,706,009 instructions # 1.84 insn per cycle + 2.888136564 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1494) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.582488e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.453415e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.453415e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.347054 sec - 8,209,972,658 cycles # 3.494 GHz - 15,201,153,143 instructions # 1.85 insn per cycle - 2.350350106 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.508044e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.828642e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.828642e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 2.905551 sec + 8,435,887,633 cycles # 2.898 GHz + 15,503,428,881 instructions # 1.84 insn per cycle + 2.911395780 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1268) (512y: 139) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.888127e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.125675e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.125675e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.206515 sec - 7,514,566,109 cycles # 3.402 GHz - 13,978,476,813 instructions # 1.86 insn per cycle - 2.209835018 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.236518e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.188285e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.188285e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.208349 sec + 7,562,205,797 cycles # 2.353 GHz + 14,282,233,625 instructions # 1.89 insn per cycle + 3.214128577 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1041) (512y: 164) (512z: 874) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039868165088E-002 Relative difference = 1.0277089312025782e-08 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt index e23690073e..4388b968c1 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:08:50 +DATE: 2024-03-01_02:25:01 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.096246e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.080730e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.278086e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.584592 sec + 2,424,873,450 cycles # 2.992 GHz + 3,757,113,510 instructions # 1.55 insn per cycle + 0.891497126 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282802e-02 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.579769e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.893874e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.893874e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 4.588717 sec - 16,042,603,900 cycles # 3.494 GHz - 43,689,952,449 instructions # 2.72 insn per cycle - 4.591858539 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.144766e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.356973e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.356973e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 5.861200 sec + 17,835,681,737 cycles # 3.040 GHz + 43,512,863,183 instructions # 2.44 insn per cycle + 5.870178360 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.164150e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.789236e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.789236e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.577598 sec - 9,012,166,111 cycles # 3.494 GHz - 21,985,206,002 instructions # 2.44 insn per cycle - 2.580747570 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.374028e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.640654e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.640654e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.010180 sec + 9,264,818,102 cycles # 3.072 GHz + 21,907,230,972 instructions # 2.36 insn per cycle + 3.030108679 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.413033e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.053155e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.053155e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.401302 sec - 8,147,553,270 cycles # 3.390 GHz - 15,499,004,047 instructions # 1.90 insn per cycle - 2.404474906 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.583102e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.970498e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.970498e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.786671 sec + 8,293,439,755 cycles # 2.970 GHz + 15,591,050,714 instructions # 1.88 insn per cycle + 2.803351674 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.441969e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.116891e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.116891e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.399786 sec - 8,145,366,561 cycles # 3.391 GHz - 15,141,676,425 instructions # 1.86 insn per cycle - 2.402973985 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.519812e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.882018e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.882018e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.857922 sec + 8,240,284,445 cycles # 2.878 GHz + 15,434,807,288 instructions # 1.87 insn per cycle + 2.873134335 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.532625e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.114438e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.114438e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 1.959359 sec - 6,688,058,045 cycles # 3.409 GHz - 12,568,020,375 instructions # 1.88 insn per cycle - 1.962540189 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.640401e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.080150e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.080150e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.738177 sec + 6,634,758,903 cycles # 2.418 GHz + 12,863,535,626 instructions # 1.94 insn per cycle + 2.752418443 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052585973637E-002 Relative difference = 2.0158743040564767e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt index 09e54302f1..5ebf98d844 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_bridge.txt @@ -1,188 +1,219 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:29:20 +DATE: 2024-03-01_03:13:35 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 7.291092e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.500878e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.500878e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 +TOTAL : 1.664885 sec + 5,743,008,286 cycles # 3.032 GHz + 10,353,112,228 instructions # 1.80 insn per cycle + 1.950710268 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282802e-02 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.560582e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.868731e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.868731e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 4.680582 sec - 16,368,713,672 cycles # 3.495 GHz - 43,840,413,084 instructions # 2.68 insn per cycle - 4.684353022 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.118079e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.318846e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.318846e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 6.094512 sec + 18,492,834,117 cycles # 3.035 GHz + 43,665,828,462 instructions # 2.36 insn per cycle + 6.100764200 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.038598e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.508158e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.508158e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.758988 sec - 9,660,848,077 cycles # 3.498 GHz - 23,320,516,277 instructions # 2.41 insn per cycle - 2.762808644 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.278046e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.410824e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.410824e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.242674 sec + 9,984,073,322 cycles # 3.074 GHz + 23,241,211,318 instructions # 2.33 insn per cycle + 3.248988906 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.263246e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.739456e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.739456e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.576121 sec - 8,764,669,029 cycles # 3.398 GHz - 16,619,915,020 instructions # 1.90 insn per cycle - 2.579987789 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.460715e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.687913e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.687913e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 3.031931 sec + 9,018,287,343 cycles # 2.969 GHz + 16,710,480,351 instructions # 1.85 insn per cycle + 3.038355322 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.291714e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.793688e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.793688e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.573938 sec - 8,759,700,956 cycles # 3.399 GHz - 16,262,157,562 instructions # 1.86 insn per cycle - 2.577692084 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.487042e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.742069e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.742069e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 3.003313 sec + 8,924,279,581 cycles # 2.966 GHz + 16,553,851,203 instructions # 1.85 insn per cycle + 3.009721457 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.243100e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.227413e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.227413e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.161827 sec - 7,403,968,795 cycles # 3.420 GHz - 13,776,638,645 instructions # 1.86 insn per cycle - 2.165624801 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.456097e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.675362e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.675362e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 3.047824 sec + 7,411,564,908 cycles # 2.428 GHz + 14,070,800,087 instructions # 1.90 insn per cycle + 3.054259465 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052585973637E-002 Relative difference = 2.0158743040564767e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt index 22febd8bf2..57f3a9eb6a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_common.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:35:03 +DATE: 2024-03-01_03:26:45 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --common OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.305418e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.176873e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.254170e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371863e-02 +- 3.269951e-06 ) GeV^0 +TOTAL : 1.176348 sec + 4,160,459,328 cycles # 2.977 GHz + 6,608,736,714 instructions # 1.59 insn per cycle + 1.454481545 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282802e-02 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.580543e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.894945e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.894945e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.163258e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.379965e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.379965e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 4.585144 sec - 16,033,044,830 cycles # 3.495 GHz - 43,690,102,128 instructions # 2.73 insn per cycle - 4.588130273 seconds time elapsed +TOTAL : 6.084905 sec + 18,848,150,042 cycles # 3.095 GHz + 43,694,410,467 instructions # 2.32 insn per cycle + 6.090122961 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.165795e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.795351e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.795351e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.362188e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.607795e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.607795e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.575915 sec - 9,005,888,202 cycles # 3.493 GHz - 21,985,016,188 instructions # 2.44 insn per cycle - 2.578895980 seconds time elapsed +TOTAL : 3.340145 sec + 10,237,006,523 cycles # 3.061 GHz + 21,987,992,116 instructions # 2.15 insn per cycle + 3.345494687 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.416776e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.062032e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.062032e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.557177e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.937995e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.937995e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.398556 sec - 8,138,328,552 cycles # 3.390 GHz - 15,499,199,008 instructions # 1.90 insn per cycle - 2.401542629 seconds time elapsed +TOTAL : 3.130033 sec + 9,276,164,079 cycles # 2.959 GHz + 15,501,530,354 instructions # 1.67 insn per cycle + 3.135291294 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.447614e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.125962e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.125962e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.607828e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.022471e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.022471e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.395743 sec - 8,131,915,937 cycles # 3.391 GHz - 15,135,872,991 instructions # 1.86 insn per cycle - 2.398760307 seconds time elapsed +TOTAL : 3.090209 sec + 9,218,829,691 cycles # 2.980 GHz + 15,143,949,757 instructions # 1.64 insn per cycle + 3.095551418 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.538033e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.123539e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.123539e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.625698e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.049871e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.049871e+06 ) sec^-1 MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 1.958851 sec - 6,688,513,152 cycles # 3.411 GHz - 12,568,494,145 instructions # 1.88 insn per cycle - 1.961860292 seconds time elapsed +TOTAL : 3.081111 sec + 7,633,670,846 cycles # 2.474 GHz + 12,572,894,419 instructions # 1.65 insn per cycle + 3.086406325 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052585973637E-002 Relative difference = 2.0158743040564767e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt index fc344cccba..72f866059b 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_curhst.txt @@ -1,138 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:34:09 +DATE: 2024-03-01_03:23:30 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --curhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.312185e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.188856e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.293387e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.840959 sec + 3,233,651,545 cycles # 3.031 GHz + 6,593,293,750 instructions # 2.04 insn per cycle + 1.123835132 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282802e-02 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe: Aborted - 4,566,934 cycles # 3.256 GHz - 6,299,440 instructions # 1.38 insn per cycle - 0.038188510 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.165423e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.380976e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.380976e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 5.753852 sec + 17,814,734,742 cycles # 3.094 GHz + 43,512,567,450 instructions # 2.44 insn per cycle + 5.759197636 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe: Aborted - 4,511,989 cycles # 2.673 GHz - 6,385,849 instructions # 1.42 insn per cycle - 0.039615964 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.367425e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.644557e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.644557e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.015821 sec + 9,302,641,553 cycles # 3.081 GHz + 21,907,397,717 instructions # 2.35 insn per cycle + 3.021054890 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe: Aborted - 4,505,776 cycles # 2.678 GHz - 6,324,297 instructions # 1.40 insn per cycle - 0.039078577 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.605570e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.994881e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.994881e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.763364 sec + 8,259,626,138 cycles # 2.984 GHz + 15,589,955,941 instructions # 1.89 insn per cycle + 2.768827600 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe: Aborted - 4,566,500 cycles # 3.261 GHz - 6,322,428 instructions # 1.38 insn per cycle - 0.037934624 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.581356e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.971929e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.971929e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.794808 sec + 8,189,932,997 cycles # 2.926 GHz + 15,434,468,382 instructions # 1.88 insn per cycle + 2.800117026 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe: Aborted - 4,587,833 cycles # 3.231 GHz - 6,330,442 instructions # 1.38 insn per cycle - 0.038651242 seconds time elapsed +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.644746e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.098711e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.098711e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.731125 sec + 6,642,886,027 cycles # 2.429 GHz + 12,862,690,732 instructions # 1.94 insn per cycle + 2.736362886 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052585973637E-002 Relative difference = 2.0158743040564767e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt index 49d434f85c..8d8716bc9a 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd0_rmbhst.txt @@ -1,183 +1,208 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:32:37 +DATE: 2024-03-01_03:20:14 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 12 --rmbhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 9.282885e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.142631e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.141870e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371710e-02 +- 3.270389e-06 ) GeV^0 +TOTAL : 1.452761 sec + 5,067,036,613 cycles # 3.030 GHz + 9,262,361,364 instructions # 1.83 insn per cycle + 1.731002061 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282802e-02 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.581672e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.896680e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.896680e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 4.582644 sec - 16,029,325,021 cycles # 3.496 GHz - 43,690,424,802 instructions # 2.73 insn per cycle - 4.585651317 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.160324e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.375621e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.375621e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 5.780149 sec + 17,815,433,670 cycles # 3.080 GHz + 43,511,102,764 instructions # 2.44 insn per cycle + 5.785180938 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 431) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.149026e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.792646e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.792646e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.584919 sec - 9,045,554,644 cycles # 3.496 GHz - 21,985,410,240 instructions # 2.43 insn per cycle - 2.587906690 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.389771e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.650423e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.650423e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 2.992624 sec + 9,227,327,267 cycles # 3.079 GHz + 21,906,426,544 instructions # 2.37 insn per cycle + 2.997895192 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1938) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.414111e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.052877e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.052877e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.399307 sec - 8,139,048,725 cycles # 3.389 GHz - 15,499,397,322 instructions # 1.90 insn per cycle - 2.402310060 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.528530e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.865855e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.865855e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.845512 sec + 8,254,984,848 cycles # 2.896 GHz + 15,590,498,904 instructions # 1.89 insn per cycle + 2.850900280 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2596) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.448749e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.125723e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.125723e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.394319 sec - 8,124,895,108 cycles # 3.390 GHz - 15,136,296,147 instructions # 1.86 insn per cycle - 2.397299852 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.609279e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.018312e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.018312e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.764714 sec + 8,215,374,590 cycles # 2.969 GHz + 15,429,066,515 instructions # 1.88 insn per cycle + 2.770036927 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2496) (512y: 9) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe -p 2048 256 12 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.541933e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.127583e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.127583e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 1.954962 sec - 6,675,259,521 cycles # 3.411 GHz - 12,568,022,737 instructions # 1.88 insn per cycle - 1.957882207 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.648656e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.090784e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.090784e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.731162 sec + 6,615,238,340 cycles # 2.419 GHz + 12,862,797,254 instructions # 1.94 insn per cycle + 2.736410000 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1736) (512y: 17) (512z: 1439) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052585973637E-002 Relative difference = 2.0158743040564767e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt index 4dc37a29ce..f9e4000e6d 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl0_hrd1.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:09:08 +DATE: 2024-03-01_02:25:31 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.096943e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.095054e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.337200e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.581297 sec + 2,416,875,461 cycles # 3.000 GHz + 3,802,904,431 instructions # 1.57 insn per cycle + 0.886522859 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282802e-02 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.720434e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.100723e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.100723e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 4.265109 sec - 14,915,458,746 cycles # 3.495 GHz - 41,447,917,911 instructions # 2.78 insn per cycle - 4.268275411 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.237656e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.486670e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.486670e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 5.444566 sec + 16,726,225,777 cycles # 3.070 GHz + 41,270,625,621 instructions # 2.47 insn per cycle + 5.454849598 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 375) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039854866802E-002 Relative difference = 1.1313746984080878e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.260922e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.026866e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.026866e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.519340 sec - 8,813,391,527 cycles # 3.495 GHz - 21,289,273,651 instructions # 2.42 insn per cycle - 2.522478814 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.460514e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.827007e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.827007e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 2.914617 sec + 8,996,783,237 cycles # 3.081 GHz + 21,210,998,059 instructions # 2.36 insn per cycle + 2.929493898 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1843) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.427524e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.128695e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.128695e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.394514 sec - 8,126,459,965 cycles # 3.390 GHz - 15,333,379,094 instructions # 1.89 insn per cycle - 2.397695798 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.611163e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.022551e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.022551e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.760181 sec + 8,249,336,928 cycles # 2.983 GHz + 15,425,238,678 instructions # 1.87 insn per cycle + 2.778856529 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2537) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.456500e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.144948e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.144948e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.391151 sec - 8,113,737,929 cycles # 3.390 GHz - 14,939,981,190 instructions # 1.84 insn per cycle - 2.394304414 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.587140e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.018405e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.018405e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.789811 sec + 8,096,556,575 cycles # 2.897 GHz + 15,238,891,903 instructions # 1.88 insn per cycle + 2.804859872 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2423) (512y: 8) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053255361738E-002 Relative difference = 2.5376902468575066e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.545530e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.144301e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.144301e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 1.956043 sec - 6,680,719,629 cycles # 3.411 GHz - 12,547,036,461 instructions # 1.88 insn per cycle - 1.959245995 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.644016e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.094854e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.094854e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.735992 sec + 6,623,617,660 cycles # 2.417 GHz + 12,843,079,376 instructions # 1.94 insn per cycle + 2.752411310 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1706) (512y: 18) (512z: 1427) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052564145764E-002 Relative difference = 1.9988585667912256e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt index 5416692647..fde060de72 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd0.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:23:08 +DATE: 2024-03-01_03:03:05 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.224284e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.181869e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.290244e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.576138 sec + 2,415,755,755 cycles # 3.001 GHz + 3,734,378,655 instructions # 1.55 insn per cycle + 0.864225849 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 117 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282802e-02 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.220532e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.908131e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.908131e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 3.440500 sec - 12,037,718,902 cycles # 3.497 GHz - 32,611,773,785 instructions # 2.71 insn per cycle - 3.443737801 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.727035e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.251286e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.251286e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 4.002640 sec + 12,159,409,273 cycles # 3.035 GHz + 32,432,694,101 instructions # 2.67 insn per cycle + 4.008158303 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 312) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039840314887E-002 Relative difference = 1.244813035273009e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.735307e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.227346e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.227346e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.275375 sec - 7,963,357,113 cycles # 3.496 GHz - 18,736,043,078 instructions # 2.35 insn per cycle - 2.278622500 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.805511e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.765564e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.765564e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 2.601867 sec + 7,999,882,010 cycles # 3.069 GHz + 18,656,600,340 instructions # 2.33 insn per cycle + 2.607493343 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1555) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039283704129E-002 Relative difference = 5.583829420356249e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.874885e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.138866e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.138866e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.180630 sec - 7,418,462,702 cycles # 3.398 GHz - 14,160,332,853 instructions # 1.91 insn per cycle - 2.183917049 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.939924e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.842069e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.842069e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.492780 sec + 7,427,313,914 cycles # 2.974 GHz + 14,251,086,474 instructions # 1.92 insn per cycle + 2.498394316 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2237) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053244447801E-002 Relative difference = 2.5291823782248813e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.068100e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.691798e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.691798e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.120331 sec - 7,409,259,501 cycles # 3.490 GHz - 13,649,785,865 instructions # 1.84 insn per cycle - 2.123584431 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.004272e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.034488e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.034488e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.444620 sec + 7,299,238,549 cycles # 2.980 GHz + 13,947,633,533 instructions # 1.91 insn per cycle + 2.450212772 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2096) (512y: 3) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053244447801E-002 Relative difference = 2.5291823782248813e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.512909e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.081395e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.081395e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 1.966622 sec - 6,716,721,759 cycles # 3.411 GHz - 13,128,043,043 instructions # 1.95 insn per cycle - 1.969885224 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.706121e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.223606e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.223606e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.681955 sec + 6,492,318,128 cycles # 2.417 GHz + 13,422,094,611 instructions # 2.07 insn per cycle + 2.687432186 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2071) (512y: 1) (512z: 1198) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052562326775E-002 Relative difference = 1.997440588685788e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt index ad73c3d757..0d6d3b3db1 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_f_inl1_hrd1.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:23:25 +DATE: 2024-03-01_03:03:32 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.215876e+08 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.204111e+09 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.337047e+09 ) sec^-1 +MeanMatrixElemValue = ( 1.371687e-02 +- 3.270220e-06 ) GeV^0 +TOTAL : 0.576922 sec + 2,404,705,116 cycles # 2.985 GHz + 3,758,296,111 instructions # 1.56 insn per cycle + 0.864210592 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 95 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282802e-02 +Avg ME (F77/CUDA) = 1.2828112125134794E-002 +Relative difference = 7.1815552823662555e-06 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.058716e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.506875e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.506875e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270267e-06 ) GeV^0 -TOTAL : 2.666546 sec - 9,327,231,220 cycles # 3.496 GHz - 25,447,900,546 instructions # 2.73 insn per cycle - 2.669764195 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.296714e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.359904e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.359904e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 3.097656 sec + 9,472,450,742 cycles # 3.053 GHz + 25,268,175,697 instructions # 2.67 insn per cycle + 3.103042436 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 263) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039838495897E-002 Relative difference = 1.2589928273811243e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.323765e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.058451e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.058451e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371887e-02 +- 3.270266e-06 ) GeV^0 -TOTAL : 2.046425 sec - 7,160,329,051 cycles # 3.495 GHz - 16,947,102,157 instructions # 2.37 insn per cycle - 2.049696854 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.079795e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.704088e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.704088e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371707e-02 +- 3.270376e-06 ) GeV^0 +TOTAL : 2.415041 sec + 7,164,638,851 cycles # 2.961 GHz + 16,869,197,703 instructions # 2.35 insn per cycle + 2.420723497 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1360) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039280066150E-002 Relative difference = 5.612189004572479e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.297201e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.290316e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.290316e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.022038 sec - 7,066,919,623 cycles # 3.491 GHz - 13,525,128,890 instructions # 1.91 insn per cycle - 2.025234558 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.078168e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.319472e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.319472e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.394138 sec + 7,165,321,711 cycles # 2.987 GHz + 13,616,190,038 instructions # 1.90 insn per cycle + 2.399577311 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2060) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053220800939E-002 Relative difference = 2.5107486628541925e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.389695e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.530798e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.530798e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 2.007329 sec - 7,015,071,750 cycles # 3.491 GHz - 13,133,210,306 instructions # 1.87 insn per cycle - 2.010465766 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.136069e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.411751e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.411751e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270341e-06 ) GeV^0 +TOTAL : 2.363661 sec + 7,031,964,685 cycles # 2.970 GHz + 13,425,613,371 instructions # 1.91 insn per cycle + 2.369281481 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1945) (512y: 4) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828053220800939E-002 Relative difference = 2.5107486628541925e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.659878e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.534461e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.534461e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371885e-02 +- 3.270112e-06 ) GeV^0 -TOTAL : 1.922796 sec - 6,566,422,271 cycles # 3.411 GHz - 12,859,737,400 instructions # 1.96 insn per cycle - 1.926087575 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.811199e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.477443e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.477443e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270342e-06 ) GeV^0 +TOTAL : 2.592425 sec + 6,321,858,831 cycles # 2.434 GHz + 13,153,560,775 instructions # 2.08 insn per cycle + 2.597985755 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2029) (512y: 1) (512z: 1083) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282805e-02 Avg ME (F77/C++) = 1.2828052536860923E-002 Relative difference = 1.977588895209662e-07 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt index 837cf18a48..4be3e76490 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd0.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:09:26 +DATE: 2024-03-01_02:26:01 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.449419e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.301374e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.190967e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.717219 sec + 2,841,227,385 cycles # 2.957 GHz + 4,430,504,412 instructions # 1.56 insn per cycle + 1.049815549 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 166 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282804e-02 +Avg ME (F77/CUDA) = 1.2828039901590279E-002 +Relative difference = 7.671454200650844e-09 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.377191e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.581422e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.581422e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 5.191610 sec - 18,159,288,198 cycles # 3.496 GHz - 44,323,801,591 instructions # 2.44 insn per cycle - 5.194897716 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.109294e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.297854e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.297854e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 6.069129 sec + 18,728,354,553 cycles # 3.083 GHz + 44,224,513,518 instructions # 2.36 insn per cycle + 6.079869673 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 439) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.293424e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.965077e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.965077e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.349210 sec - 11,716,680,806 cycles # 3.496 GHz - 30,916,285,925 instructions # 2.64 insn per cycle - 3.352547582 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.745615e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.315952e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.315952e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 4.001256 sec + 12,323,242,096 cycles # 3.075 GHz + 30,917,838,115 instructions # 2.51 insn per cycle + 4.017904894 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1685) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.736734e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.720058e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.720058e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.874813 sec - 9,725,993,159 cycles # 3.380 GHz - 19,272,349,716 instructions # 1.98 insn per cycle - 2.878123285 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.078908e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.902249e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.902249e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.416443 sec + 10,120,877,504 cycles # 2.958 GHz + 19,374,733,180 instructions # 1.91 insn per cycle + 3.431641491 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2130) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.837062e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.944737e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.944737e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.808701 sec - 9,507,947,404 cycles # 3.382 GHz - 18,641,360,147 instructions # 1.96 insn per cycle - 2.812001364 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.114347e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.979731e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.979731e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.374976 sec + 9,706,052,635 cycles # 2.871 GHz + 18,944,519,271 instructions # 1.95 insn per cycle + 3.395274500 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1860) (512y: 188) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.604258e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.479437e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.479437e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.334946 sec - 7,938,052,698 cycles # 3.396 GHz - 14,749,724,015 instructions # 1.86 insn per cycle - 2.338250895 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.874531e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.524823e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.524823e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.760847 sec + 8,409,257,244 cycles # 2.233 GHz + 15,057,436,319 instructions # 1.79 insn per cycle + 3.776930410 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1024) (512y: 155) (512z: 1316) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 diff --git a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt index 076a22f416..77001f8935 100644 --- a/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_eemumu_mad/log_eemumu_mad_m_inl0_hrd1.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum' -DATE: 2024-03-01_19:09:47 +DATE: 2024-03-01_02:26:35 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 12 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_EPEM_MUPMUM_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.443987e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.284127e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.143740e+08 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 0.699538 sec + 2,805,342,043 cycles # 2.999 GHz + 4,414,010,673 instructions # 1.57 insn per cycle + 1.020206687 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 154 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.282804e-02 +Avg ME (F77/CUDA) = 1.2828039901590279E-002 +Relative difference = 7.671454200650844e-09 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.451232e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.679621e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.679621e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 4.958423 sec - 17,349,182,496 cycles # 3.497 GHz - 42,572,483,064 instructions # 2.45 insn per cycle - 4.961760502 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.155620e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.358194e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.358194e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 5.837265 sec + 18,090,198,997 cycles # 3.097 GHz + 42,472,863,850 instructions # 2.35 insn per cycle + 5.848007644 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 421) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.362004e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.081863e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.081863e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 3.272026 sec - 11,447,774,045 cycles # 3.496 GHz - 30,223,060,667 instructions # 2.64 insn per cycle - 3.275374923 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.786116e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.385279e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.385279e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.920672 sec + 12,137,736,337 cycles # 3.092 GHz + 30,225,042,392 instructions # 2.49 insn per cycle + 3.938311189 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 1692) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039952548879E-002 Relative difference = 3.6990156841838714e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.762711e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.775192e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.775192e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.854465 sec - 9,656,821,540 cycles # 3.380 GHz - 19,154,909,313 instructions # 1.98 insn per cycle - 2.857812496 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.068049e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.882124e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.882124e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.437770 sec + 10,015,371,277 cycles # 2.909 GHz + 19,256,811,213 instructions # 1.92 insn per cycle + 3.454377757 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2146) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.912614e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.047841e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.047841e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.751170 sec - 9,315,104,434 cycles # 3.383 GHz - 18,442,592,455 instructions # 1.98 insn per cycle - 2.754516121 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.207913e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.137874e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.137874e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.235635 sec + 9,645,810,411 cycles # 2.976 GHz + 18,756,051,671 instructions # 1.94 insn per cycle + 3.251774736 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1834) (512y: 191) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe -p 2048 256 12 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_EPEM_MUPMUM_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.659029e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.577201e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.577201e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.371879e-02 +- 3.270020e-06 ) GeV^0 -TOTAL : 2.308570 sec - 7,854,338,821 cycles # 3.399 GHz - 14,673,429,400 instructions # 1.87 insn per cycle - 2.311927939 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.969792e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.680976e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.680976e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 +TOTAL : 3.592139 sec + 8,293,535,644 cycles # 2.305 GHz + 14,979,176,568 instructions # 1.81 insn per cycle + 3.613399615 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1021) (512y: 156) (512z: 1305) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/ee_mumu.mad/SubProcesses/P1_epem_mupmum/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.282804e-02 Avg ME (F77/C++) = 1.2828039951670679E-002 Relative difference = 3.767475112924841e-09 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt index 38908c3fb1..9a5df19d5b 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:10:08 +DATE: 2024-03-01_02:27:08 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.025930e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.135524e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.271935e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.535145 sec + 2,303,454,226 cycles # 2.990 GHz + 3,249,200,622 instructions # 1.41 insn per cycle + 0.848848936 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028807e+00 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.782462e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.856967e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.856967e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.897327 sec - 13,634,239,091 cycles # 3.496 GHz - 38,385,444,668 instructions # 2.82 insn per cycle - 3.900701537 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 674) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.185653e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.250591e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.250591e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.903669 sec + 15,175,795,116 cycles # 3.093 GHz + 38,374,949,840 instructions # 2.53 insn per cycle + 4.917105673 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515645 Relative difference = 3.258803994438787e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.802582e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.042466e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.042466e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.309068 sec - 8,081,737,607 cycles # 3.496 GHz - 24,573,373,219 instructions # 3.04 insn per cycle - 2.312471054 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.662249e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.860778e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.860778e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.968890 sec + 9,101,848,873 cycles # 3.060 GHz + 24,578,505,710 instructions # 2.70 insn per cycle + 2.986159008 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.804161e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.442356e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.442356e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.464920 sec - 4,882,777,358 cycles # 3.328 GHz - 11,230,055,186 instructions # 2.30 insn per cycle - 1.468368837 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.728560e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.222175e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.222175e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.936093 sec + 5,474,671,571 cycles # 2.819 GHz + 11,252,385,098 instructions # 2.06 insn per cycle + 1.954008279 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.365690e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.101366e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.101366e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.377099 sec - 4,595,020,426 cycles # 3.330 GHz - 10,504,303,399 instructions # 2.29 insn per cycle - 1.380458418 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.292169e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.895497e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.895497e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.774092 sec + 4,972,729,611 cycles # 2.794 GHz + 10,557,445,760 instructions # 2.12 insn per cycle + 1.789622209 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.158651e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.839395e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.839395e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.408011 sec - 4,692,884,605 cycles # 3.327 GHz - 7,735,219,640 instructions # 1.65 insn per cycle - 1.411361186 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.894024e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.109310e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.109310e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.799185 sec + 5,395,066,029 cycles # 1.924 GHz + 7,793,871,634 instructions # 1.44 insn per cycle + 2.817161041 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt index 3f9ce1ce83..598396a8e7 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_bridge.txt @@ -1,188 +1,219 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:29:40 +DATE: 2024-03-01_03:14:07 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.569533e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.877038e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.877038e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.801549 sec + 3,157,604,220 cycles # 3.025 GHz + 4,827,294,021 instructions # 1.53 insn per cycle + 1.101037847 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028807e+00 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.768798e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.843131e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.843131e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.955815 sec - 13,832,169,175 cycles # 3.494 GHz - 38,450,905,414 instructions # 2.78 insn per cycle - 3.959789848 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 674) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.171920e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.234476e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.234476e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 5.008942 sec + 15,497,351,856 cycles # 3.090 GHz + 38,433,512,801 instructions # 2.48 insn per cycle + 5.015755142 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515645 Relative difference = 3.258803994438787e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.744115e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.977846e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.977846e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.391627 sec - 8,365,963,654 cycles # 3.493 GHz - 24,752,915,009 instructions # 2.96 insn per cycle - 2.395600644 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.610749e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.808660e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.808660e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.090616 sec + 9,430,020,802 cycles # 3.049 GHz + 24,763,068,407 instructions # 2.63 insn per cycle + 3.097621879 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.710929e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.332597e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.332597e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.535175 sec - 5,133,184,909 cycles # 3.337 GHz - 11,512,737,529 instructions # 2.24 insn per cycle - 1.539209812 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.825746e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.328246e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.328246e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.984017 sec + 5,826,620,771 cycles # 2.928 GHz + 11,538,062,844 instructions # 1.98 insn per cycle + 1.990946794 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.182780e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.885497e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.885497e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.458979 sec - 4,882,301,300 cycles # 3.339 GHz - 10,787,066,129 instructions # 2.21 insn per cycle - 1.463011407 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.484023e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.101551e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.101551e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.799262 sec + 5,294,562,816 cycles # 2.933 GHz + 10,843,404,980 instructions # 2.05 insn per cycle + 1.806082483 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.015903e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.677662e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.677662e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.490826 sec - 4,984,106,886 cycles # 3.336 GHz - 7,975,592,103 instructions # 1.60 insn per cycle - 1.494900312 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.045937e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.276782e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.276782e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.778138 sec + 5,743,518,580 cycles # 2.063 GHz + 8,037,207,687 instructions # 1.40 insn per cycle + 2.785184310 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt index bbbf7c9fcf..977053e874 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_common.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:35:21 +DATE: 2024-03-01_03:27:17 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.571348e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.154956e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.272098e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 +TOTAL : 0.617245 sec + 2,532,813,012 cycles # 2.999 GHz + 3,701,870,616 instructions # 1.46 insn per cycle + 0.904006340 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028807e+00 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.781344e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.856090e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.856090e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.183394e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.247420e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.247420e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.899236 sec - 13,635,452,338 cycles # 3.495 GHz - 38,385,453,862 instructions # 2.82 insn per cycle - 3.902350302 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 674) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.966854 sec + 15,343,121,883 cycles # 3.087 GHz + 38,390,661,623 instructions # 2.50 insn per cycle + 4.972403311 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515645 Relative difference = 3.258803994438787e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.795094e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.033476e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.033476e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.599283e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.796561e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.796561e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.312363 sec - 8,089,982,420 cycles # 3.495 GHz - 24,573,716,651 instructions # 3.04 insn per cycle - 2.315558026 seconds time elapsed +TOTAL : 3.079495 sec + 9,279,730,828 cycles # 3.010 GHz + 24,577,932,954 instructions # 2.65 insn per cycle + 3.085060857 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.852044e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.495498e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.495498e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.908259e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.435116e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.435116e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.456479 sec - 4,857,277,999 cycles # 3.329 GHz - 11,230,242,008 instructions # 2.31 insn per cycle - 1.459674299 seconds time elapsed +TOTAL : 1.937503 sec + 5,654,473,993 cycles # 2.911 GHz + 11,233,989,199 instructions # 1.99 insn per cycle + 1.943141738 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.332356e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.056692e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.056692e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.578665e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.217153e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.217153e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.381870 sec - 4,611,557,382 cycles # 3.331 GHz - 10,504,643,617 instructions # 2.28 insn per cycle - 1.385061393 seconds time elapsed +TOTAL : 1.757396 sec + 5,128,637,723 cycles # 2.910 GHz + 10,505,547,256 instructions # 2.05 insn per cycle + 1.762900213 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.304542e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.010565e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.010565e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.070979e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.306684e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.306684e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.385516 sec - 4,620,699,813 cycles # 3.329 GHz - 7,735,038,876 instructions # 1.67 insn per cycle - 1.388698104 seconds time elapsed +TOTAL : 2.739915 sec + 5,558,468,681 cycles # 2.025 GHz + 7,741,606,815 instructions # 1.39 insn per cycle + 2.745378653 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt index f421a264ca..29a670398e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_curhst.txt @@ -1,138 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:34:14 +DATE: 2024-03-01_03:24:00 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.579097e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155655e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.270242e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.554530 sec + 2,358,271,315 cycles # 3.013 GHz + 3,682,090,929 instructions # 1.56 insn per cycle + 0.840283729 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028807e+00 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe: Aborted - 4,520,752 cycles # 3.261 GHz - 6,298,367 instructions # 1.39 insn per cycle - 0.037814907 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 674) (avx2: 0) (512y: 0) (512z: 0) +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.177843e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.241689e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.241689e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.919493 sec + 15,156,700,875 cycles # 3.078 GHz + 38,373,397,442 instructions # 2.53 insn per cycle + 4.925048190 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515645 Relative difference = 3.258803994438787e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe: Aborted - 4,515,960 cycles # 3.261 GHz - 6,261,612 instructions # 1.39 insn per cycle - 0.038098825 seconds time elapsed +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.588081e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.785746e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.785746e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.028765 sec + 9,114,596,397 cycles # 3.011 GHz + 24,581,732,536 instructions # 2.70 insn per cycle + 3.034354491 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe: Aborted - 4,384,140 cycles # 2.671 GHz - 6,269,302 instructions # 1.43 insn per cycle - 0.039362811 seconds time elapsed +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.938829e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.476997e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.476997e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.869884 sec + 5,467,539,853 cycles # 2.917 GHz + 11,251,237,475 instructions # 2.06 insn per cycle + 1.875504692 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe: Aborted - 4,828,863 cycles # 3.264 GHz - 6,315,171 instructions # 1.31 insn per cycle - 0.038710635 seconds time elapsed +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.273575e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.896545e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.896545e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.780053 sec + 4,944,261,583 cycles # 2.770 GHz + 10,558,833,446 instructions # 2.14 insn per cycle + 1.785881884 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe: Aborted - 4,574,413 cycles # 2.717 GHz - 6,280,420 instructions # 1.37 insn per cycle - 0.038677314 seconds time elapsed +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.090701e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.328087e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.328087e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.667720 sec + 5,371,754,599 cycles # 2.010 GHz + 7,792,372,952 instructions # 1.45 insn per cycle + 2.673339648 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt index 0b4ae759d7..e5cfc13b3e 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd0_rmbhst.txt @@ -1,183 +1,208 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:32:56 +DATE: 2024-03-01_03:20:45 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.972409e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155179e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.272541e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.697938 sec + 2,798,675,219 cycles # 3.021 GHz + 4,376,672,842 instructions # 1.56 insn per cycle + 0.983897382 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028807e+00 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.782456e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.857581e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.857581e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.898425 sec - 13,637,022,234 cycles # 3.496 GHz - 38,385,249,434 instructions # 2.81 insn per cycle - 3.901702317 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 674) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.189575e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.254386e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.254386e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.893907 sec + 15,162,024,600 cycles # 3.096 GHz + 38,372,989,497 instructions # 2.53 insn per cycle + 4.899450957 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 668) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515645 Relative difference = 3.258803994438787e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.798533e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.038408e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.038408e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.311197 sec - 8,087,896,887 cycles # 3.496 GHz - 24,573,667,331 instructions # 3.04 insn per cycle - 2.314383849 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.704548e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.907149e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.907149e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.935182 sec + 9,091,941,153 cycles # 3.094 GHz + 24,577,519,112 instructions # 2.70 insn per cycle + 2.940777194 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2159) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.865253e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.517718e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.517718e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.455280 sec - 4,850,997,989 cycles # 3.328 GHz - 11,230,074,042 instructions # 2.32 insn per cycle - 1.458527044 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.938740e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.466662e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.466662e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.869802 sec + 5,458,289,042 cycles # 2.911 GHz + 11,250,961,339 instructions # 2.06 insn per cycle + 1.875881825 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2376) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.377231e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.116751e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.116751e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.375896 sec - 4,589,261,968 cycles # 3.330 GHz - 10,502,721,091 instructions # 2.29 insn per cycle - 1.379048846 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.493369e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.117845e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.117845e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.719311 sec + 5,034,836,824 cycles # 2.920 GHz + 10,558,271,294 instructions # 2.10 insn per cycle + 1.725057980 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2074) (512y: 144) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.291081e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.994820e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.994820e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.387507 sec - 4,623,729,787 cycles # 3.327 GHz - 7,735,191,540 instructions # 1.67 insn per cycle - 1.390703747 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.013824e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.247297e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.247297e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.716839 sec + 5,403,556,568 cycles # 1.987 GHz + 7,794,191,095 instructions # 1.44 insn per cycle + 2.722528243 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1445) (512y: 122) (512z: 1542) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt index 3c03138a4d..73356b00dd 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl0_hrd1.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:10:23 +DATE: 2024-03-01_02:27:35 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.058566e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.139903e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.277694e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.538743 sec + 2,297,794,086 cycles # 2.963 GHz + 3,276,125,304 instructions # 1.43 insn per cycle + 0.856267333 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028807e+00 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.710529e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.781570e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.781570e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.996408 sec - 13,979,413,584 cycles # 3.496 GHz - 40,196,760,217 instructions # 2.88 insn per cycle - 3.999779379 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 687) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.197217e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.262307e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.262307e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.877526 sec + 15,081,677,651 cycles # 3.089 GHz + 40,100,660,385 instructions # 2.66 insn per cycle + 4.889980594 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 5.077291e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.346738e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.346738e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.191848 sec - 7,669,887,605 cycles # 3.495 GHz - 23,666,455,211 instructions # 3.09 insn per cycle - 2.195224486 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.910252e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.135599e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.135599e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.787478 sec + 8,606,981,244 cycles # 3.082 GHz + 23,670,854,000 instructions # 2.75 insn per cycle + 2.801213189 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2072) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.056560e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.577761e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.577761e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.608065 sec - 5,356,503,720 cycles # 3.326 GHz - 13,039,165,916 instructions # 2.43 insn per cycle - 1.611516596 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.287623e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.696089e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.696089e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.088271 sec + 6,101,163,180 cycles # 2.915 GHz + 13,060,965,379 instructions # 2.14 insn per cycle + 2.110411764 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2546) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.286280e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.839274e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.839274e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.563652 sec - 5,212,654,568 cycles # 3.328 GHz - 12,266,825,956 instructions # 2.35 insn per cycle - 1.567147417 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.510708e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.955656e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.955656e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.007458 sec + 5,795,313,103 cycles # 2.878 GHz + 12,320,114,352 instructions # 2.13 insn per cycle + 2.035740422 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2093) (512y: 294) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.906483e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.553389e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.553389e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.449150 sec - 4,824,305,914 cycles # 3.323 GHz - 9,542,657,586 instructions # 1.98 insn per cycle - 1.452571538 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.559784e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.746127e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.746127e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.054998 sec + 5,836,990,709 cycles # 1.908 GHz + 9,601,704,067 instructions # 1.64 insn per cycle + 3.069883688 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1509) (512y: 209) (512z: 1971) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt index 25ef1d5f3d..7ca7ca6f27 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd0.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:23:40 +DATE: 2024-03-01_03:03:58 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.566149e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.156976e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.274435e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.520509 sec + 2,251,864,611 cycles # 2.979 GHz + 3,200,076,053 instructions # 1.42 insn per cycle + 0.813049887 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028807e+00 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.192021e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.290805e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.290805e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.415579 sec - 11,947,926,572 cycles # 3.495 GHz - 34,397,405,204 instructions # 2.88 insn per cycle - 3.419004365 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.538728e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.625778e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.625778e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.235724 sec + 13,018,811,907 cycles # 3.070 GHz + 34,384,492,801 instructions # 2.64 insn per cycle + 4.241723051 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 686) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.733586e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.879812e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.879812e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.934771 sec - 10,271,141,406 cycles # 3.497 GHz - 24,003,451,169 instructions # 2.34 insn per cycle - 2.938285168 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.065411e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.209741e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.209741e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.527791 sec + 10,618,068,276 cycles # 3.005 GHz + 24,006,297,751 instructions # 2.26 insn per cycle + 3.533644608 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2582) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 5.994183e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.364715e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.364715e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.870369 sec - 6,224,279,915 cycles # 3.323 GHz - 12,380,406,144 instructions # 1.99 insn per cycle - 1.873883038 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.845204e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.186466e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.186466e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.268558 sec + 6,594,099,256 cycles # 2.900 GHz + 12,400,446,525 instructions # 1.88 insn per cycle + 2.274329127 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3154) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516200 Relative difference = 3.2588037208240405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.449558e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.878567e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.878567e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.751343 sec - 5,830,240,093 cycles # 3.324 GHz - 11,520,328,697 instructions # 1.98 insn per cycle - 1.754890337 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.148118e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.537652e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.537652e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.142175 sec + 6,250,159,272 cycles # 2.911 GHz + 11,574,474,977 instructions # 1.85 insn per cycle + 2.148019416 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2690) (512y: 239) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516200 Relative difference = 3.2588037208240405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.956146e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.616868e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.616868e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.441128 sec - 4,799,320,936 cycles # 3.324 GHz - 9,238,083,496 instructions # 1.92 insn per cycle - 1.444627570 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.139590e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.381511e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.381511e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.637824 sec + 5,343,225,675 cycles # 2.022 GHz + 9,294,792,947 instructions # 1.74 insn per cycle + 2.643638198 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2115) (512y: 282) (512z: 1958) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt index 94cf15e1a9..6740b658ab 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_d_inl1_hrd1.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:23:56 +DATE: 2024-03-01_03:04:25 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.563128e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.158314e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.275634e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.525125 sec + 2,266,508,632 cycles # 2.999 GHz + 3,227,683,893 instructions # 1.42 insn per cycle + 0.815560561 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028807e+00 +Avg ME (F77/CUDA) = 2.0288063388516822 +Relative difference = 3.2588034143755247e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.402676e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.515698e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.515698e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.211500 sec - 11,235,460,343 cycles # 3.496 GHz - 35,050,056,722 instructions # 3.12 insn per cycle - 3.214868359 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.686393e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.784184e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.784184e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.008193 sec + 12,350,315,150 cycles # 3.077 GHz + 35,037,181,267 instructions # 2.84 insn per cycle + 4.014100641 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 457) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515649 Relative difference = 3.258803992249869e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.645770e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.783620e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.783620e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.003938 sec - 10,512,340,009 cycles # 3.496 GHz - 23,080,735,378 instructions # 2.20 insn per cycle - 3.007803609 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.126314e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.271590e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.271590e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 3.458899 sec + 10,688,048,117 cycles # 3.085 GHz + 23,082,662,787 instructions # 2.16 insn per cycle + 3.464737128 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2363) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388515654 Relative difference = 3.2588039900609506e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.439627e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.869125e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.869125e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.750278 sec - 5,827,983,221 cycles # 3.325 GHz - 11,935,795,493 instructions # 2.05 insn per cycle - 1.753728973 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.065386e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.447820e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.447820e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.175532 sec + 6,167,789,524 cycles # 2.829 GHz + 11,956,365,830 instructions # 1.94 insn per cycle + 2.181490352 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2509) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.801848e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.279461e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.279461e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.666561 sec - 5,551,706,657 cycles # 3.326 GHz - 11,073,468,618 instructions # 1.99 insn per cycle - 1.670062494 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.355284e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.776167e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.776167e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.062589 sec + 6,012,687,929 cycles # 2.908 GHz + 11,129,506,913 instructions # 1.85 insn per cycle + 2.068524285 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2126) (512y: 174) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.282360e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.994130e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.994130e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.389411 sec - 4,628,373,888 cycles # 3.325 GHz - 8,962,575,488 instructions # 1.94 insn per cycle - 1.392904800 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.234665e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.489644e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.489644e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.581777 sec + 5,215,223,845 cycles # 2.016 GHz + 9,019,923,506 instructions # 1.73 insn per cycle + 2.587755549 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1650) (512y: 208) (512z: 1567) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063388516204 Relative difference = 3.2588037186351226e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt index a7acc63d87..3164378b7a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:10:38 +DATE: 2024-03-01_02:28:04 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.210726e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.585567e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.966482e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.485254 sec + 2,068,141,298 cycles # 2.904 GHz + 2,916,142,359 instructions # 1.41 insn per cycle + 0.784434250 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.061036e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.162907e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.162907e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.542085 sec - 12,391,424,054 cycles # 3.496 GHz - 38,265,921,403 instructions # 3.09 insn per cycle - 3.545513591 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 589) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.313091e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.389644e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.389644e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.621612 sec + 14,026,409,554 cycles # 3.032 GHz + 38,341,238,705 instructions # 2.73 insn per cycle + 4.632085783 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199022179469 Relative difference = 4.819651478256564e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.565544e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.063647e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.063647e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 1.710778 sec - 5,991,801,017 cycles # 3.497 GHz - 15,824,708,814 instructions # 2.64 insn per cycle - 1.714001105 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.217740e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.647077e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.647077e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.094155 sec + 6,477,656,873 cycles # 3.085 GHz + 15,815,714,256 instructions # 2.44 insn per cycle + 2.109661469 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193548331037 Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.241504e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.408345e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.408345e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 0.952958 sec - 3,189,843,324 cycles # 3.339 GHz - 7,575,039,950 instructions # 2.37 insn per cycle - 0.956152471 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.558089e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.098648e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.098648e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.180439 sec + 3,464,791,228 cycles # 2.924 GHz + 7,594,553,534 instructions # 2.19 insn per cycle + 1.196926932 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.319510e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.510809e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.510809e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 0.906095 sec - 3,034,230,865 cycles # 3.340 GHz - 7,150,811,632 instructions # 2.36 insn per cycle - 0.909351723 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.028669e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.195924e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.195924e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.103361 sec + 3,253,544,502 cycles # 2.935 GHz + 7,202,500,133 instructions # 2.21 insn per cycle + 1.115792553 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.446120e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.684000e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.684000e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 0.836297 sec - 2,802,120,149 cycles # 3.341 GHz - 5,780,771,468 instructions # 2.06 insn per cycle - 0.839556975 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.586127e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.450667e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.450667e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.467307 sec + 3,062,229,633 cycles # 2.079 GHz + 5,834,823,887 instructions # 1.91 insn per cycle + 1.480044473 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183349184692 Relative difference = 1.6508058850146622e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt index 244f7ed452..b32abcb3fe 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_bridge.txt @@ -1,188 +1,219 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:29:55 +DATE: 2024-03-01_03:14:35 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 7.139226e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.486374e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.486374e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 +TOTAL : 0.665285 sec + 2,679,931,908 cycles # 3.001 GHz + 4,173,181,221 instructions # 1.56 insn per cycle + 0.950193790 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.053680e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.154942e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.154942e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.570943 sec - 12,491,119,184 cycles # 3.495 GHz - 38,309,264,755 instructions # 3.07 insn per cycle - 3.574697502 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 589) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.339175e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.415593e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.415593e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.608146 sec + 14,198,803,048 cycles # 3.078 GHz + 38,383,841,480 instructions # 2.70 insn per cycle + 4.614561058 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199022179469 Relative difference = 4.819651478256564e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.494756e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.984731e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.984731e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 1.760397 sec - 6,151,072,141 cycles # 3.488 GHz - 16,106,098,708 instructions # 2.62 insn per cycle - 1.764265698 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.150361e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.574288e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.574288e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.164951 sec + 6,682,648,138 cycles # 3.079 GHz + 16,095,511,662 instructions # 2.41 insn per cycle + 2.171478460 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193548331037 Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.220386e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.382151e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.382151e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 0.999164 sec - 3,353,390,197 cycles # 3.346 GHz - 7,813,166,304 instructions # 2.33 insn per cycle - 1.003041613 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.377335e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.075060e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.075060e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.245724 sec + 3,655,872,382 cycles # 2.921 GHz + 7,830,960,228 instructions # 2.14 insn per cycle + 1.252058919 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.301043e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.484531e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.484531e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 0.948601 sec - 3,185,245,475 cycles # 3.348 GHz - 7,388,936,707 instructions # 2.32 insn per cycle - 0.952384757 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.884024e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.146718e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.146718e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.193275 sec + 3,439,455,837 cycles # 2.869 GHz + 7,440,735,686 instructions # 2.16 insn per cycle + 1.199824293 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=524288) -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.412343e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.637273e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.637273e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 0.886349 sec - 2,975,186,907 cycles # 3.345 GHz - 6,036,018,349 instructions # 2.03 insn per cycle - 0.890113209 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.445766e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.274506e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.274506e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.539244 sec + 3,276,504,779 cycles # 2.121 GHz + 6,089,433,455 instructions # 1.86 insn per cycle + 1.545785864 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183349184692 Relative difference = 1.6508058850146622e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt index d4cdb5a44f..1418229a2f 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_common.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:35:36 +DATE: 2024-03-01_03:27:44 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --common OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 9.472574e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.636713e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.962164e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.079446e+00 +- 3.403306e-03 ) GeV^0 +TOTAL : 0.558880 sec + 2,364,095,478 cycles # 3.003 GHz + 3,484,344,192 instructions # 1.47 insn per cycle + 0.845198156 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.058975e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.161124e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.161124e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.358072e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.436073e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.436073e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.544825 sec - 12,394,762,036 cycles # 3.494 GHz - 38,265,645,755 instructions # 3.09 insn per cycle - 3.547916257 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 589) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 4.585598 sec + 14,172,267,813 cycles # 3.088 GHz + 38,370,669,897 instructions # 2.71 insn per cycle + 4.590984697 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199022179469 Relative difference = 4.819651478256564e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.488571e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.974371e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.974371e+05 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.211957e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.640936e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.640936e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 1.729685 sec - 6,044,983,083 cycles # 3.490 GHz - 15,825,361,332 instructions # 2.62 insn per cycle - 1.732686150 seconds time elapsed +TOTAL : 2.148796 sec + 6,634,619,629 cycles # 3.081 GHz + 15,827,825,218 instructions # 2.39 insn per cycle + 2.154083020 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193548331037 Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.240146e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.410041e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.410041e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.547921e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.095970e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.095970e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 0.954496 sec - 3,193,272,434 cycles # 3.337 GHz - 7,575,303,386 instructions # 2.37 insn per cycle - 0.957550268 seconds time elapsed +TOTAL : 1.236002 sec + 3,624,228,310 cycles # 2.921 GHz + 7,577,923,207 instructions # 2.09 insn per cycle + 1.241371528 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.319772e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.511858e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.511858e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.019099e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.183109e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.183109e+06 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 0.905312 sec - 3,033,184,068 cycles # 3.342 GHz - 7,150,791,405 instructions # 2.36 insn per cycle - 0.908396432 seconds time elapsed +TOTAL : 1.166800 sec + 3,412,475,771 cycles # 2.913 GHz + 7,154,107,852 instructions # 2.10 insn per cycle + 1.172143118 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.446800e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.685435e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.685435e+06 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.590832e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.447342e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.447342e+05 ) sec^-1 MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 0.835810 sec - 2,801,569,225 cycles # 3.343 GHz - 5,780,758,142 instructions # 2.06 insn per cycle - 0.838834457 seconds time elapsed +TOTAL : 1.519807 sec + 3,228,336,001 cycles # 2.118 GHz + 5,784,936,071 instructions # 1.79 insn per cycle + 1.525231071 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183349184692 Relative difference = 1.6508058850146622e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt index 54a85a955c..6cc1ea482a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_curhst.txt @@ -1,138 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:34:19 +DATE: 2024-03-01_03:24:27 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --curhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 9.444388e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.637591e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.958095e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.506234 sec + 2,151,061,698 cycles # 2.964 GHz + 3,317,932,316 instructions # 1.54 insn per cycle + 0.783859096 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe: Aborted - 4,538,068 cycles # 3.245 GHz - 6,262,611 instructions # 1.38 insn per cycle - 0.037803610 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 589) (avx2: 0) (512y: 0) (512z: 0) +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.348187e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.425786e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.425786e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.550945 sec + 14,020,959,724 cycles # 3.078 GHz + 38,340,893,799 instructions # 2.73 insn per cycle + 4.556370309 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199022179469 Relative difference = 4.819651478256564e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe: Aborted - 4,534,363 cycles # 3.253 GHz - 6,314,690 instructions # 1.39 insn per cycle - 0.037873298 seconds time elapsed +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.084306e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.497288e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.497288e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.146528 sec + 6,470,246,026 cycles # 3.008 GHz + 15,815,477,798 instructions # 2.44 insn per cycle + 2.151761392 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193548331037 Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe: Aborted - 4,637,775 cycles # 3.255 GHz - 6,328,952 instructions # 1.36 insn per cycle - 0.038380049 seconds time elapsed +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.654131e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.108425e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.108425e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.168173 sec + 3,446,745,579 cycles # 2.939 GHz + 7,593,552,481 instructions # 2.20 insn per cycle + 1.173417445 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe: Aborted - 4,573,961 cycles # 3.257 GHz - 6,321,016 instructions # 1.38 insn per cycle - 0.037677949 seconds time elapsed +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.035097e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.201064e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.201064e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.096134 sec + 3,246,063,667 cycles # 2.949 GHz + 7,201,559,823 instructions # 2.22 insn per cycle + 1.101526557 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe: Aborted - 4,502,378 cycles # 3.260 GHz - 6,322,124 instructions # 1.40 insn per cycle - 0.037792372 seconds time elapsed +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.601752e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.455480e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.455480e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.463312 sec + 3,061,733,109 cycles # 2.086 GHz + 5,833,735,363 instructions # 1.91 insn per cycle + 1.468683964 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183349184692 Relative difference = 1.6508058850146622e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt index c1a03aebbd..d1c301e36a 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd0_rmbhst.txt @@ -1,183 +1,208 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:33:11 +DATE: 2024-03-01_03:21:13 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 2 --rmbhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.521212e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.620937e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.942141e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086805e+00 +- 3.414078e-03 ) GeV^0 +TOTAL : 0.625420 sec + 2,414,961,393 cycles # 2.854 GHz + 3,791,061,685 instructions # 1.57 insn per cycle + 0.904442863 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.061154e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.162932e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.162932e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.542051 sec - 12,388,794,866 cycles # 3.495 GHz - 38,264,992,582 instructions # 3.09 insn per cycle - 3.545107831 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 589) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.328946e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.404018e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.404018e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.586154 sec + 14,183,213,679 cycles # 3.090 GHz + 38,341,040,102 instructions # 2.70 insn per cycle + 4.591510537 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 587) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199022179469 Relative difference = 4.819651478256564e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.542448e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.037442e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.037442e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 1.716038 sec - 6,003,936,148 cycles # 3.494 GHz - 15,824,627,159 instructions # 2.64 insn per cycle - 1.719029866 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.242078e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.670922e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.670922e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 2.084805 sec + 6,467,654,599 cycles # 3.095 GHz + 15,814,952,627 instructions # 2.45 insn per cycle + 2.090234852 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2690) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193548331037 Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.242773e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.410199e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.410199e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 0.952242 sec - 3,187,363,454 cycles # 3.339 GHz - 7,575,422,549 instructions # 2.38 insn per cycle - 0.955261247 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.553311e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.096092e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.096092e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.181028 sec + 3,453,301,700 cycles # 2.913 GHz + 7,593,575,205 instructions # 2.20 insn per cycle + 1.186225517 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3049) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.320310e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.510744e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.510744e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 0.905497 sec - 3,033,662,646 cycles # 3.342 GHz - 7,150,728,059 instructions # 2.36 insn per cycle - 0.908547463 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.023252e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.188398e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.188398e+06 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.108864 sec + 3,247,038,827 cycles # 2.916 GHz + 7,202,168,264 instructions # 2.22 insn per cycle + 1.114391762 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2849) (512y: 23) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181684445590 Relative difference = 8.302595855806234e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe -p 2048 256 2 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.448049e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.685618e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.685618e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 0.835256 sec - 2,796,643,852 cycles # 3.341 GHz - 5,780,733,558 instructions # 2.07 insn per cycle - 0.838315041 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.596256e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.449431e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.449431e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.464294 sec + 3,059,603,183 cycles # 2.083 GHz + 5,833,854,527 instructions # 1.91 insn per cycle + 1.469681735 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2363) (512y: 24) (512z: 1889) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183349184692 Relative difference = 1.6508058850146622e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt index 8b91db0e17..adc2ed2114 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl0_hrd1.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:10:51 +DATE: 2024-03-01_02:28:27 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.323457e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.629602e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.019308e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.480923 sec + 2,116,431,851 cycles # 3.003 GHz + 3,022,655,895 instructions # 1.43 insn per cycle + 0.777218279 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.113653e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.221073e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.221073e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.483745 sec - 12,191,233,427 cycles # 3.497 GHz - 39,807,639,093 instructions # 3.27 insn per cycle - 3.486969433 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 580) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.299655e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.373045e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.373045e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.644587 sec + 14,360,257,758 cycles # 3.089 GHz + 39,833,716,550 instructions # 2.77 insn per cycle + 4.652300252 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 570) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199028000236 Relative difference = 4.790961076489297e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.134346e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.923260e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.923260e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 1.402390 sec - 4,912,248,574 cycles # 3.497 GHz - 15,294,202,423 instructions # 3.11 insn per cycle - 1.405648629 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.819246e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.374211e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.374211e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 1.888755 sec + 5,601,188,109 cycles # 2.957 GHz + 15,285,931,975 instructions # 2.73 insn per cycle + 1.901754882 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2474) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193548331037 Relative difference = 1.748963824709674e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.083568e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.948403e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.948403e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.262664 sec - 4,211,226,733 cycles # 3.329 GHz - 9,715,508,285 instructions # 2.31 insn per cycle - 1.265920918 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.809980e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.511061e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.511061e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.623137 sec + 4,755,173,593 cycles # 2.919 GHz + 9,735,141,159 instructions # 2.05 insn per cycle + 1.639641207 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3708) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288182108197361 Relative difference = 1.0391259163456515e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.245949e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.013771e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.013771e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.245932 sec - 4,154,573,718 cycles # 3.329 GHz - 9,273,579,239 instructions # 2.23 insn per cycle - 1.249125651 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.976796e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.708401e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.708401e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.586631 sec + 4,632,931,570 cycles # 2.912 GHz + 9,326,747,974 instructions # 2.01 insn per cycle + 1.599475417 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3496) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288182108197361 Relative difference = 1.0391259163456515e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.182229e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.339411e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.339411e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 0.997929 sec - 3,329,282,294 cycles # 3.328 GHz - 6,979,885,802 instructions # 2.10 insn per cycle - 1.001225403 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.246902e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.812329e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.812329e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.762945 sec + 3,668,593,409 cycles # 2.074 GHz + 7,034,535,336 instructions # 1.92 insn per cycle + 1.779301540 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2605) (512y: 12) (512z: 2221) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183459779248 Relative difference = 1.7053177021099307e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt index 7b61726f5d..82aee2242c 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd0.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:24:11 +DATE: 2024-03-01_03:04:53 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 9.193238e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.649659e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.969705e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.478757 sec + 2,104,839,063 cycles # 2.996 GHz + 2,995,662,279 instructions # 1.42 insn per cycle + 0.760483148 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.274947e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.391721e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.391721e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.318252 sec - 11,611,007,117 cycles # 3.497 GHz - 34,397,967,949 instructions # 2.96 insn per cycle - 3.321458141 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.482809e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.574079e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.574079e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 4.311067 sec + 12,598,770,011 cycles # 2.919 GHz + 34,372,549,657 instructions # 2.73 insn per cycle + 4.316594695 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 696) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199094356969 Relative difference = 4.463890496342449e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.009263e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.579301e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.579301e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 1.609519 sec - 5,634,626,051 cycles # 3.496 GHz - 14,869,650,453 instructions # 2.64 insn per cycle - 1.612769577 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.536780e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.027176e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.027176e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 1.978899 sec + 6,105,197,866 cycles # 3.078 GHz + 14,859,942,037 instructions # 2.43 insn per cycle + 1.984598314 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3009) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193803280592 Relative difference = 1.8746278463897685e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.363912e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.029612e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.029612e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.228350 sec - 4,099,462,044 cycles # 3.331 GHz - 9,010,087,189 instructions # 2.20 insn per cycle - 1.231663795 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.439196e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.305375e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.305375e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.494763 sec + 4,316,279,907 cycles # 2.878 GHz + 9,028,948,283 instructions # 2.09 insn per cycle + 1.500523975 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4443) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181999931112 Relative difference = 9.857617164523888e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.733550e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.074728e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.074728e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.188251 sec - 3,967,821,126 cycles # 3.332 GHz - 8,612,196,683 instructions # 2.17 insn per cycle - 1.191524344 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.366245e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.235578e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.235578e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.509333 sec + 4,207,142,397 cycles # 2.778 GHz + 8,663,183,236 instructions # 2.06 insn per cycle + 1.515104262 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4243) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288181999931112 Relative difference = 9.857617164523888e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.076849e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.204629e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.204629e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.084935 sec - 3,621,365,276 cycles # 3.330 GHz - 7,753,075,474 instructions # 2.14 insn per cycle - 1.088240742 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.816959e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.308753e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.308753e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.886655 sec + 3,832,564,290 cycles # 2.026 GHz + 7,807,000,610 instructions # 2.04 insn per cycle + 1.892395760 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4419) (512y: 0) (512z: 2556) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183246739209 Relative difference = 1.6003107281264138e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt index 61d1db8a51..dda1db1b3c 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_f_inl1_hrd1.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:24:24 +DATE: 2024-03-01_03:05:16 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 9.270822e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.690662e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.026451e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086718e+00 +- 3.413389e-03 ) GeV^0 +TOTAL : 0.478497 sec + 2,092,584,267 cycles # 2.987 GHz + 2,982,481,806 instructions # 1.43 insn per cycle + 0.759974164 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 127 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028811e+00 +Avg ME (F77/CUDA) = 2.0288499749731272 +Relative difference = 1.9210746159747678e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.548926e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.686522e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.686522e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079573e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 3.071996 sec - 10,748,097,201 cycles # 3.497 GHz - 35,134,305,151 instructions # 3.27 insn per cycle - 3.075247886 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.703982e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.806761e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.806761e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086780e+00 +- 3.413794e-03 ) GeV^0 +TOTAL : 3.962914 sec + 11,745,545,496 cycles # 2.960 GHz + 35,108,793,810 instructions # 2.99 insn per cycle + 3.968579892 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 470) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028820e+00 Avg ME (F77/C++) = 2.0288199094356969 Relative difference = 4.463890496342449e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.177278e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.785505e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.785505e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079572e+00 +- 3.404712e-03 ) GeV^0 -TOTAL : 1.574710 sec - 5,513,686,982 cycles # 3.496 GHz - 14,479,421,661 instructions # 2.63 insn per cycle - 1.578018917 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.697555e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.224866e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.224866e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086779e+00 +- 3.413793e-03 ) GeV^0 +TOTAL : 1.925244 sec + 5,962,598,726 cycles # 3.089 GHz + 14,469,931,867 instructions # 2.43 insn per cycle + 1.931094914 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2572) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028819e+00 Avg ME (F77/C++) = 2.0288193583255634 Relative difference = 1.7661780742548925e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.002551e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.111042e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.111042e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.154884 sec - 3,857,962,551 cycles # 3.333 GHz - 8,855,937,803 instructions # 2.30 insn per cycle - 1.158168839 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.546151e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.447291e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.447291e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.475701 sec + 4,155,772,808 cycles # 2.809 GHz + 8,874,967,057 instructions # 2.14 insn per cycle + 1.481449825 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3574) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288182107033208 Relative difference = 1.0385521077446488e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.549625e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.050633e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.050633e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404207e-03 ) GeV^0 -TOTAL : 1.209387 sec - 4,037,720,483 cycles # 3.332 GHz - 8,359,219,935 instructions # 2.07 insn per cycle - 1.212702560 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.932743e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.882289e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.882289e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.405788 sec + 4,123,527,517 cycles # 2.923 GHz + 8,411,119,259 instructions # 2.04 insn per cycle + 1.411551419 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3319) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288182107033208 Relative difference = 1.0385521077446488e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.088722e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.221112e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.221112e+06 ) sec^-1 -MeanMatrixElemValue = ( 2.079550e+00 +- 3.404208e-03 ) GeV^0 -TOTAL : 1.073794 sec - 3,584,683,392 cycles # 3.331 GHz - 7,646,262,060 instructions # 2.13 insn per cycle - 1.077090879 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.930692e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.444813e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.444813e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086810e+00 +- 3.414231e-03 ) GeV^0 +TOTAL : 1.851731 sec + 3,787,634,254 cycles # 2.040 GHz + 7,699,934,932 instructions # 2.03 insn per cycle + 1.857323010 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3435) (512y: 0) (512z: 2108) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028818e+00 Avg ME (F77/C++) = 2.0288183204829693 Relative difference = 1.5796536184903122e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt index 4c26b84629..9748a5aab4 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd0.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:11:04 +DATE: 2024-03-01_02:28:51 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.029545e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.136839e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.273391e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.526886 sec + 2,307,341,508 cycles # 3.024 GHz + 3,271,429,537 instructions # 1.42 insn per cycle + 0.836809323 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 214 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028807e+00 +Avg ME (F77/CUDA) = 2.0288063423243874 +Relative difference = 3.241686432649386e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.726645e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.798326e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.798326e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 3.974088 sec - 13,901,240,398 cycles # 3.496 GHz - 38,519,700,643 instructions # 2.77 insn per cycle - 3.977444803 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 678) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.174399e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.238464e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.238464e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.926720 sec + 15,303,062,403 cycles # 3.103 GHz + 38,574,821,235 instructions # 2.52 insn per cycle + 4.935986004 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 672) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.888376e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.137236e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.137236e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.271352 sec - 7,950,613,495 cycles # 3.496 GHz - 24,219,096,177 instructions # 3.05 insn per cycle - 2.274776261 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.750432e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.964332e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.964332e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.903163 sec + 8,984,859,488 cycles # 3.089 GHz + 24,224,163,348 instructions # 2.70 insn per cycle + 2.918366508 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2188) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.937592e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.600519e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.600519e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.442766 sec - 4,812,419,685 cycles # 3.329 GHz - 11,255,351,127 instructions # 2.34 insn per cycle - 1.446223272 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.977342e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.518236e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.518236e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.860423 sec + 5,396,289,064 cycles # 2.891 GHz + 11,276,510,611 instructions # 2.09 insn per cycle + 1.875091896 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2480) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.502777e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.260895e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.260895e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.356803 sec - 4,529,795,443 cycles # 3.332 GHz - 10,470,222,537 instructions # 2.31 insn per cycle - 1.360332208 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.792892e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.469147e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.469147e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 1.648151 sec + 4,836,682,110 cycles # 2.924 GHz + 10,524,586,299 instructions # 2.18 insn per cycle + 1.662467551 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2167) (512y: 148) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.442635e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.183690e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.183690e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.366107 sec - 4,556,171,739 cycles # 3.329 GHz - 7,545,350,102 instructions # 1.66 insn per cycle - 1.369560048 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.224142e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.479514e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.479514e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.587933 sec + 5,228,382,592 cycles # 2.016 GHz + 7,603,380,674 instructions # 1.45 insn per cycle + 2.604403134 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1633) (512y: 126) (512z: 1608) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 diff --git a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt index ff51ce9edf..4c3bdeb3a7 100644 --- a/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggtt_mad/log_ggtt_mad_m_inl0_hrd1.txt @@ -1,183 +1,206 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx' -DATE: 2024-03-01_19:11:19 +DATE: 2024-03-01_02:29:18 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.025642e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.140563e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.276898e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 0.529654 sec + 2,293,467,091 cycles # 2.992 GHz + 3,241,408,242 instructions # 1.41 insn per cycle + 0.836485234 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 208 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 2.028807e+00 +Avg ME (F77/CUDA) = 2.0288063423243874 +Relative difference = 3.241686432649386e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.696678e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.767700e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.767700e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 4.016604 sec - 14,052,871,002 cycles # 3.497 GHz - 40,347,826,232 instructions # 2.87 insn per cycle - 4.019950872 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 683) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.144775e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.207356e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.207356e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 4.994421 sec + 15,338,753,655 cycles # 3.068 GHz + 40,369,233,372 instructions # 2.63 insn per cycle + 5.002383718 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 669) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 5.079311e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.348964e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.348964e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 2.189998 sec - 7,660,292,335 cycles # 3.494 GHz - 23,249,177,880 instructions # 3.04 insn per cycle - 2.193387914 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.003325e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.239627e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.239627e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.723159 sec + 8,478,435,163 cycles # 3.107 GHz + 23,253,497,249 instructions # 2.74 insn per cycle + 2.738604338 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 2091) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288063903750300 Relative difference = 3.0048445715164216e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.802190e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.284263e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.284263e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.662935 sec - 5,537,885,797 cycles # 3.326 GHz - 12,941,082,077 instructions # 2.34 insn per cycle - 1.666384267 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.181118e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.571113e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.571113e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.127824 sec + 6,241,547,842 cycles # 2.925 GHz + 12,962,413,577 instructions # 2.08 insn per cycle + 2.144515260 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2669) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.133486e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 7.661662e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 7.661662e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.593841 sec - 5,310,193,730 cycles # 3.327 GHz - 12,185,581,580 instructions # 2.29 insn per cycle - 1.597248619 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.322331e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.729304e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.729304e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.074458 sec + 5,923,278,346 cycles # 2.853 GHz + 12,242,730,346 instructions # 2.07 insn per cycle + 2.086429072 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2209) (512y: 296) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe -p 2048 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.179196e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.881876e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.881876e+05 ) sec^-1 -MeanMatrixElemValue = ( 2.079401e+00 +- 3.402993e-03 ) GeV^0 -TOTAL : 1.404473 sec - 4,673,986,503 cycles # 3.322 GHz - 8,684,795,820 instructions # 1.86 insn per cycle - 1.407876338 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.899734e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.116034e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.116034e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.086689e+00 +- 3.413217e-03 ) GeV^0 +TOTAL : 2.794263 sec + 5,618,790,292 cycles # 2.007 GHz + 8,743,459,975 instructions # 1.56 insn per cycle + 2.808786612 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1490) (512y: 183) (512z: 1909) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_tt.mad/SubProcesses/P1_gg_ttx/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 2.028807e+00 Avg ME (F77/C++) = 2.0288064057068964 Relative difference = 2.9292737240031234e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt index d6ad8dae6d..c4c4bff630 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_19:11:34 +DATE: 2024-03-01_02:29:46 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.473707e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.045050e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.061478e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.463329 sec + 2,069,832,304 cycles # 3.002 GHz + 2,918,096,235 instructions # 1.41 insn per cycle + 0.772559551 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.045387e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.319438e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.336268e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.608947 sec + 2,562,374,732 cycles # 3.012 GHz + 3,879,371,783 instructions # 1.51 insn per cycle + 0.910123971 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.413122e+00 +Avg ME (F77/CUDA) = 1.4131213684418649 +Relative difference = 4.469239988637851e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.210168e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.225481e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.225481e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.129663 sec - 17,942,023,952 cycles # 3.497 GHz - 59,485,012,848 instructions # 3.32 insn per cycle - 5.132002080 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1439) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.585844e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.598254e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.598254e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.359535 sec + 19,687,428,773 cycles # 3.094 GHz + 59,604,296,849 instructions # 3.03 insn per cycle + 6.365859123 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.227836e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.283842e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.283842e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.655716 sec - 9,284,099,234 cycles # 3.494 GHz - 30,662,650,809 instructions # 3.30 insn per cycle - 2.658111464 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.691737e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.735631e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.735631e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.515479 sec + 10,373,655,779 cycles # 2.948 GHz + 30,676,465,519 instructions # 2.96 insn per cycle + 3.528584808 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.296898e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.319348e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.319348e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.287545 sec - 4,259,821,564 cycles # 3.305 GHz - 11,003,859,405 instructions # 2.58 insn per cycle - 1.289855021 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.754839e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.932602e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.932602e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.702212 sec + 4,885,421,396 cycles # 2.863 GHz + 11,020,224,832 instructions # 2.26 insn per cycle + 1.717667988 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.392823e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.418968e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.418968e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.201286 sec - 3,975,596,259 cycles # 3.305 GHz - 10,273,857,845 instructions # 2.58 insn per cycle - 1.203631834 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.095884e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.117707e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.117707e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.517268 sec + 4,368,757,303 cycles # 2.872 GHz + 10,296,904,442 instructions # 2.36 insn per cycle + 1.532957385 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.911241e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.961026e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.961026e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 0.882693 sec - 2,920,749,975 cycles # 3.303 GHz - 5,817,985,024 instructions # 1.99 insn per cycle - 0.885121704 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.761348e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.875289e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.875289e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.135983 sec + 4,101,318,849 cycles # 1.917 GHz + 5,843,401,136 instructions # 1.42 insn per cycle + 2.151041040 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt index 105cbe8fdc..7a80a6327c 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd0_bridge.txt @@ -1,188 +1,237 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_19:30:08 +DATE: 2024-03-01_03:14:59 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.634181e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.802665e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.802665e+06 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.494713 sec + 2,059,588,733 cycles # 2.926 GHz + 3,067,379,574 instructions # 1.49 insn per cycle + 0.764554853 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.715023e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.440232e+06 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.440232e+06 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.824199 sec + 3,179,114,916 cycles # 2.965 GHz + 5,069,610,946 instructions # 1.59 insn per cycle + 1.133521853 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.413122e+00 +Avg ME (F77/CUDA) = 1.4131213684418649 +Relative difference = 4.469239988637851e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.215926e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.231294e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.231294e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.124663 sec - 17,917,725,694 cycles # 3.495 GHz - 59,493,244,220 instructions # 3.32 insn per cycle - 5.127126814 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1439) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.525402e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.537809e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.537809e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.518056 sec + 19,750,480,394 cycles # 3.028 GHz + 59,611,727,500 instructions # 3.02 insn per cycle + 6.522447301 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1466) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.222963e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.278776e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.278776e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.663023 sec - 9,311,853,268 cycles # 3.495 GHz - 30,712,911,054 instructions # 3.30 insn per cycle - 2.665569418 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.903232e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.949588e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.949588e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.370584 sec + 10,396,817,898 cycles # 3.081 GHz + 30,723,473,589 instructions # 2.96 insn per cycle + 3.375008450 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5153) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.294133e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.316415e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.316415e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.295128 sec - 4,286,401,261 cycles # 3.305 GHz - 11,055,645,863 instructions # 2.58 insn per cycle - 1.297609303 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.888216e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.006946e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.006946e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.685691 sec + 4,902,930,827 cycles # 2.902 GHz + 11,066,989,869 instructions # 2.26 insn per cycle + 1.690115997 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4467) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.391259e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.417196e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.417196e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.206857 sec - 3,995,174,773 cycles # 3.306 GHz - 10,322,661,719 instructions # 2.58 insn per cycle - 1.209297943 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.103682e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.126401e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.126401e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.513774 sec + 4,402,683,305 cycles # 2.901 GHz + 10,346,890,880 instructions # 2.35 insn per cycle + 1.518250177 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4137) (512y: 91) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.907041e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.955766e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.955766e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 0.888605 sec - 2,942,234,200 cycles # 3.305 GHz - 5,854,009,429 instructions # 1.99 insn per cycle - 0.891034303 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.798042e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.913691e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.913691e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.132010 sec + 4,131,468,761 cycles # 1.935 GHz + 5,881,941,509 instructions # 1.42 insn per cycle + 2.136586909 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1540) (512y: 95) (512z: 3466) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt index 6001d7f706..90bf6e6455 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_d_inl0_hrd1.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_19:11:50 +DATE: 2024-03-01_02:30:15 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.404765e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.032804e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.048930e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.465265 sec + 2,029,896,808 cycles # 2.980 GHz + 2,854,741,238 instructions # 1.41 insn per cycle + 0.763772288 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.033730e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.306062e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.322624e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.607194 sec + 2,545,937,909 cycles # 2.996 GHz + 3,826,405,631 instructions # 1.50 insn per cycle + 0.909330494 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.413122e+00 +Avg ME (F77/CUDA) = 1.4131213684418649 +Relative difference = 4.469239988637851e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.239919e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.255416e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.255416e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.082609 sec - 17,776,981,365 cycles # 3.497 GHz - 58,797,467,293 instructions # 3.31 insn per cycle - 5.084873713 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1323) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.602792e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.615496e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.615496e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.317260 sec + 19,445,883,412 cycles # 3.076 GHz + 58,795,735,881 instructions # 3.02 insn per cycle + 6.323702590 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1313) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.233270e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.289870e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.289870e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.653124 sec - 9,279,704,528 cycles # 3.496 GHz - 30,333,640,947 instructions # 3.27 insn per cycle - 2.655395297 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.903926e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.950247e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.950247e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.363533 sec + 10,256,448,579 cycles # 3.046 GHz + 30,347,165,405 instructions # 2.96 insn per cycle + 3.377280590 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 4970) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684432433 Relative difference = 4.46923023397472e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.249037e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.269785e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.269785e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.336298 sec - 4,419,712,116 cycles # 3.304 GHz - 11,468,590,102 instructions # 2.59 insn per cycle - 1.338622545 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.598787e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.768674e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.768674e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.728674 sec + 5,043,692,461 cycles # 2.911 GHz + 11,484,727,811 instructions # 2.28 insn per cycle + 1.738921569 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4591) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.321449e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.345024e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.345024e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.264700 sec - 4,183,889,364 cycles # 3.304 GHz - 10,818,738,539 instructions # 2.59 insn per cycle - 1.266987637 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.033952e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.054066e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.054066e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.607009 sec + 4,642,681,786 cycles # 2.882 GHz + 10,842,961,046 instructions # 2.34 insn per cycle + 1.618440779 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4183) (512y: 244) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.893042e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.941104e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.941104e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 0.890295 sec - 2,945,907,544 cycles # 3.303 GHz - 6,079,729,396 instructions # 2.06 insn per cycle - 0.892645568 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.765124e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.875111e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.875111e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.134046 sec + 4,109,311,958 cycles # 1.922 GHz + 6,106,472,133 instructions # 1.49 insn per cycle + 2.145705149 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1457) (512y: 139) (512z: 3568) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213684416484 Relative difference = 4.469241520660492e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt index b334186e41..af4f474b65 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_19:12:06 +DATE: 2024-03-01_02:30:44 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.308616e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.230427e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.340211e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 +TOTAL : 0.445727 sec + 2,001,558,197 cycles # 3.000 GHz + 2,820,746,449 instructions # 1.41 insn per cycle + 0.736568143 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.061859e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.424190e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.524056e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 +TOTAL : 0.500107 sec + 2,158,124,631 cycles # 2.977 GHz + 3,092,829,809 instructions # 1.43 insn per cycle + 0.784432881 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.412608e+00 +Avg ME (F77/CUDA) = 1.4132214346515752 +Relative difference = 0.00043425681546129636 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.311464e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.328435e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.328435e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 -TOTAL : 4.972112 sec - 17,386,645,192 cycles # 3.496 GHz - 58,906,569,186 instructions # 3.39 insn per cycle - 4.974344840 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1027) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.674607e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.688116e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.688116e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 6.146873 sec + 19,061,096,774 cycles # 3.099 GHz + 58,958,014,215 instructions # 3.09 insn per cycle + 6.153306662 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412986e+00 Avg ME (F77/C++) = 1.4129858051842916 Relative difference = 1.3787518662898538e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.100537e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.119548e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.119548e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.724758e+02 +- 2.665339e+02 ) GeV^-2 -TOTAL : 1.511800 sec - 5,288,183,416 cycles # 3.495 GHz - 16,687,010,204 instructions # 3.16 insn per cycle - 1.514049543 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.781065e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.932207e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.932207e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 1.886682 sec + 5,850,782,122 cycles # 3.096 GHz + 16,695,269,066 instructions # 2.85 insn per cycle + 1.898716135 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5765) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412987e+00 Avg ME (F77/C++) = 1.4129865669244737 Relative difference = 3.06496469061158e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.462224e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.557798e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.557798e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743722e+02 +- 2.676604e+02 ) GeV^-2 -TOTAL : 0.688318 sec - 2,281,074,327 cycles # 3.307 GHz - 5,967,081,727 instructions # 2.62 insn per cycle - 0.690552036 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.892145e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.960485e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.960485e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.886334 sec + 2,581,461,055 cycles # 2.900 GHz + 5,980,838,355 instructions # 2.32 insn per cycle + 0.901108038 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 Avg ME (F77/C++) = 1.4133161655815059 Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.669958e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.775682e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.775682e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743722e+02 +- 2.676604e+02 ) GeV^-2 -TOTAL : 0.637008 sec - 2,111,600,396 cycles # 3.307 GHz - 5,581,688,810 instructions # 2.64 insn per cycle - 0.639312316 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.036523e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.118274e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.118274e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.825324 sec + 2,349,134,788 cycles # 2.832 GHz + 5,603,128,082 instructions # 2.39 insn per cycle + 0.837493797 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4645) (512y: 36) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 Avg ME (F77/C++) = 1.4133161655815059 Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.932137e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.167092e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.167092e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743730e+02 +- 2.676609e+02 ) GeV^-2 -TOTAL : 0.440183 sec - 1,458,007,499 cycles # 3.301 GHz - 3,311,390,347 instructions # 2.27 insn per cycle - 0.442462707 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.468368e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.511305e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.511305e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.138775 sec + 2,054,810,359 cycles # 1.798 GHz + 3,334,038,485 instructions # 1.62 insn per cycle + 1.149410848 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2131) (512y: 39) (512z: 3668) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 Avg ME (F77/C++) = 1.4133164033579249 Relative difference = 2.85398258307829e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt index dc7aef3b25..f62f4c8cdf 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd0_bridge.txt @@ -1,188 +1,237 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_19:30:24 +DATE: 2024-03-01_03:15:29 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.995753e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.112595e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.112595e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.009071e+02 +- 5.002295e+01 ) GeV^-2 +TOTAL : 0.451281 sec + 1,977,131,537 cycles # 2.986 GHz + 2,910,150,577 instructions # 1.47 insn per cycle + 0.718929629 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 254 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.708417e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.567455e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.567455e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.737500e+02 +- 4.776370e+02 ) GeV^-2 +TOTAL : 0.637857 sec + 2,608,085,808 cycles # 2.999 GHz + 3,961,129,191 instructions # 1.52 insn per cycle + 0.928114705 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.412608e+00 +Avg ME (F77/CUDA) = 1.4132214346515752 +Relative difference = 0.00043425681546129636 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.313699e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.330593e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.330593e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 -TOTAL : 4.970690 sec - 17,398,259,542 cycles # 3.499 GHz - 58,910,916,539 instructions # 3.39 insn per cycle - 4.972903941 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1027) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.667614e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.681311e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.681311e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 6.166590 sec + 19,068,958,964 cycles # 3.091 GHz + 58,962,429,433 instructions # 3.09 insn per cycle + 6.170849448 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1034) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412986e+00 Avg ME (F77/C++) = 1.4129858051842916 Relative difference = 1.3787518662898538e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.100372e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.119384e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.119384e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.724758e+02 +- 2.665339e+02 ) GeV^-2 -TOTAL : 1.515113 sec - 5,305,885,790 cycles # 3.498 GHz - 16,735,297,320 instructions # 3.15 insn per cycle - 1.517362548 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.742153e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.893438e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.893438e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 1.898339 sec + 5,876,062,473 cycles # 3.090 GHz + 16,741,995,731 instructions # 2.85 insn per cycle + 1.902713080 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5765) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412987e+00 Avg ME (F77/C++) = 1.4129865669244737 Relative difference = 3.06496469061158e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.343773e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.429942e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.429942e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743722e+02 +- 2.676604e+02 ) GeV^-2 -TOTAL : 0.724732 sec - 2,402,094,625 cycles # 3.307 GHz - 6,003,946,640 instructions # 2.50 insn per cycle - 0.727046765 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.880787e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.949754e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.949754e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.895765 sec + 2,600,620,319 cycles # 2.891 GHz + 6,016,590,564 instructions # 2.31 insn per cycle + 0.900189489 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 Avg ME (F77/C++) = 1.4133161655815059 Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.660234e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.764784e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.764784e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743722e+02 +- 2.676604e+02 ) GeV^-2 -TOTAL : 0.642041 sec - 2,128,904,938 cycles # 3.308 GHz - 5,618,859,132 instructions # 2.64 insn per cycle - 0.644359266 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.084629e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.167676e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.167676e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.810420 sec + 2,363,958,510 cycles # 2.904 GHz + 5,639,045,986 instructions # 2.39 insn per cycle + 0.814799834 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4645) (512y: 36) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 Avg ME (F77/C++) = 1.4133161655815059 Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.928773e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.164302e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.164302e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743730e+02 +- 2.676609e+02 ) GeV^-2 -TOTAL : 0.443516 sec - 1,473,489,767 cycles # 3.311 GHz - 3,352,410,215 instructions # 2.28 insn per cycle - 0.445839527 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.603454e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.652417e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.652417e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.048212 sec + 2,071,251,869 cycles # 1.970 GHz + 3,374,799,702 instructions # 1.63 insn per cycle + 1.052574627 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2131) (512y: 39) (512z: 3668) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 Avg ME (F77/C++) = 1.4133164033579249 Relative difference = 2.85398258307829e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt index 15e9b2f227..b43a9401e8 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_f_inl0_hrd1.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_19:12:19 +DATE: 2024-03-01_02:31:09 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.359219e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.312667e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.422625e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008472e+02 +- 5.002447e+01 ) GeV^-2 +TOTAL : 0.446885 sec + 1,972,174,797 cycles # 2.962 GHz + 2,746,314,290 instructions # 1.39 insn per cycle + 0.738224654 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 248 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.060800e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.419962e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.520064e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.630099e+02 +- 4.770719e+02 ) GeV^-2 +TOTAL : 0.497273 sec + 2,176,246,033 cycles # 3.004 GHz + 3,133,180,341 instructions # 1.44 insn per cycle + 0.782102946 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.412608e+00 +Avg ME (F77/CUDA) = 1.4132214346515752 +Relative difference = 0.00043425681546129636 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.323721e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.340795e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.340795e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.724764e+02 +- 2.665343e+02 ) GeV^-2 -TOTAL : 4.954042 sec - 17,317,128,762 cycles # 3.495 GHz - 58,675,021,220 instructions # 3.39 insn per cycle - 4.956261990 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1024) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.676079e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.689805e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.689805e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 6.143350 sec + 18,995,848,931 cycles # 3.090 GHz + 58,700,265,502 instructions # 3.09 insn per cycle + 6.150073952 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1029) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412986e+00 Avg ME (F77/C++) = 1.4129858051842916 Relative difference = 1.3787518662898538e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.165888e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.187242e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.187242e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.724758e+02 +- 2.665339e+02 ) GeV^-2 -TOTAL : 1.428429 sec - 4,993,014,379 cycles # 3.492 GHz - 16,503,883,029 instructions # 3.31 insn per cycle - 1.430662597 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.180884e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.346917e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.346917e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.009236e+02 +- 5.002643e+01 ) GeV^-2 +TOTAL : 1.804269 sec + 5,584,642,506 cycles # 3.088 GHz + 16,510,962,038 instructions # 2.96 insn per cycle + 1.819572816 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5551) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.412987e+00 Avg ME (F77/C++) = 1.4129865669244737 Relative difference = 3.06496469061158e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.140223e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.212640e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.212640e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743722e+02 +- 2.676604e+02 ) GeV^-2 -TOTAL : 0.788410 sec - 2,610,891,292 cycles # 3.306 GHz - 6,621,195,509 instructions # 2.54 insn per cycle - 0.790692475 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.634306e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.685973e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.685973e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 1.022630 sec + 2,975,513,176 cycles # 2.898 GHz + 6,634,498,276 instructions # 2.23 insn per cycle + 1.034400565 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5568) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 Avg ME (F77/C++) = 1.4133161655815059 Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.287783e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.364522e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.364522e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743722e+02 +- 2.676604e+02 ) GeV^-2 -TOTAL : 0.739428 sec - 2,449,362,633 cycles # 3.306 GHz - 6,234,567,338 instructions # 2.55 insn per cycle - 0.741705444 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.769784e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.829611e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.829611e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008855e+02 +- 5.002467e+01 ) GeV^-2 +TOTAL : 0.945795 sec + 2,752,522,160 cycles # 2.898 GHz + 6,256,039,450 instructions # 2.27 insn per cycle + 0.961442115 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5279) (512y: 25) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 Avg ME (F77/C++) = 1.4133161655815059 Relative difference = 1.1715816267550621e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.476688e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.662734e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.662734e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.743730e+02 +- 2.676609e+02 ) GeV^-2 -TOTAL : 0.494659 sec - 1,638,048,070 cycles # 3.302 GHz - 3,675,411,384 instructions # 2.24 insn per cycle - 0.496969912 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.392018e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.430701e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.430701e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008856e+02 +- 5.002468e+01 ) GeV^-2 +TOTAL : 1.200320 sec + 2,230,572,619 cycles # 1.852 GHz + 3,698,329,997 instructions # 1.66 insn per cycle + 1.213663484 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2378) (512y: 29) (512z: 3963) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413316e+00 Avg ME (F77/C++) = 1.4133164033579249 Relative difference = 2.85398258307829e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt index d6556bb1be..568d6c4513 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd0.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_19:12:32 +DATE: 2024-03-01_02:31:34 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.426575e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.039569e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.055629e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.463709 sec + 2,071,639,040 cycles # 3.004 GHz + 2,941,031,538 instructions # 1.42 insn per cycle + 0.764842159 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.035948e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.309187e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.325703e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.608855 sec + 2,552,084,280 cycles # 3.004 GHz + 3,794,047,088 instructions # 1.49 insn per cycle + 0.909216297 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.413122e+00 +Avg ME (F77/CUDA) = 1.4131213755569487 +Relative difference = 4.418889885423659e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.149728e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.164397e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.164397e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.227650 sec - 18,278,883,667 cycles # 3.496 GHz - 60,529,183,661 instructions # 3.31 insn per cycle - 5.230024763 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1404) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.546543e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.558753e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.558753e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.456566 sec + 20,000,355,725 cycles # 3.096 GHz + 60,532,425,335 instructions # 3.03 insn per cycle + 6.462989015 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1399) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.312678e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.370148e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.370148e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.620058 sec - 9,163,068,711 cycles # 3.495 GHz - 30,372,553,579 instructions # 3.31 insn per cycle - 2.622366147 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.015629e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.062224e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.062224e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.288178 sec + 10,191,043,016 cycles # 3.096 GHz + 30,384,591,666 instructions # 2.98 insn per cycle + 3.302408299 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5280) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213792564823 Relative difference = 4.392710025734405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.306653e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.329616e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.329616e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.278379 sec - 4,229,813,296 cycles # 3.305 GHz - 10,963,305,485 instructions # 2.59 insn per cycle - 1.280804887 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.844182e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.002719e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.002719e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.686926 sec + 4,874,678,301 cycles # 2.883 GHz + 10,979,160,826 instructions # 2.25 insn per cycle + 1.698730583 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4624) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.415509e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.442464e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.442464e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.182291 sec - 3,912,618,342 cycles # 3.305 GHz - 10,225,163,393 instructions # 2.61 insn per cycle - 1.184557374 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4279) (512y: 82) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.132241e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.155783e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.155783e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.469271 sec + 4,278,421,569 cycles # 2.904 GHz + 10,248,685,624 instructions # 2.40 insn per cycle + 1.480280367 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4280) (512y: 82) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.830365e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.874980e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.874980e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 0.920395 sec - 3,040,685,755 cycles # 3.298 GHz - 6,018,479,747 instructions # 1.98 insn per cycle - 0.922695458 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.587751e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.694540e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.694540e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.183850 sec + 4,204,822,902 cycles # 1.923 GHz + 6,044,506,630 instructions # 1.44 insn per cycle + 2.192719745 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2066) (512y: 117) (512z: 3540) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213786174055 Relative difference = 4.3972324717191576e-07 diff --git a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt index 29df44b487..2001d2a062 100644 --- a/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttg_mad/log_ggttg_mad_m_inl0_hrd1.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg' -DATE: 2024-03-01_19:12:48 +DATE: 2024-03-01_02:32:03 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.409979e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.033107e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.049247e+07 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 0.461655 sec + 2,079,301,655 cycles # 3.013 GHz + 2,945,288,445 instructions # 1.42 insn per cycle + 0.761228896 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.037338e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.304237e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.318241e+07 ) sec^-1 +MeanMatrixElemValue = ( 6.734461e+02 +- 4.775415e+02 ) GeV^-2 +TOTAL : 0.603998 sec + 2,550,056,991 cycles # 3.016 GHz + 3,770,712,997 instructions # 1.48 insn per cycle + 0.905342631 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.413122e+00 +Avg ME (F77/CUDA) = 1.4131213755569487 +Relative difference = 4.418889885423659e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.185651e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.200819e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.200819e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 5.168749 sec - 18,081,178,947 cycles # 3.497 GHz - 59,877,727,481 instructions # 3.31 insn per cycle - 5.171016307 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 1262) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.536387e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.548597e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.548597e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 6.482109 sec + 19,897,203,281 cycles # 3.068 GHz + 59,934,079,759 instructions # 3.01 insn per cycle + 6.488470935 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 1276) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213859069593 Relative difference = 4.345647726386255e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.319458e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.377020e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.377020e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 2.617395 sec - 9,156,723,314 cycles # 3.496 GHz - 30,085,782,756 instructions # 3.29 insn per cycle - 2.619742420 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.079933e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.127366e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.127366e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 3.246582 sec + 10,068,513,741 cycles # 3.097 GHz + 30,097,905,174 instructions # 2.99 insn per cycle + 3.264343936 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 5082) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213792564823 Relative difference = 4.392710025734405e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.247849e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.268698e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.268698e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.337352 sec - 4,424,282,737 cycles # 3.304 GHz - 11,463,474,174 instructions # 2.59 insn per cycle - 1.339740203 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4717) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.599229e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.768469e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.768469e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.728964 sec + 5,016,079,762 cycles # 2.895 GHz + 11,483,054,886 instructions # 2.29 insn per cycle + 1.742427809 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4723) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.341509e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.365682e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.365682e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 1.246200 sec - 4,123,785,215 cycles # 3.305 GHz - 10,787,122,633 instructions # 2.62 insn per cycle - 1.248520950 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.051243e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.071758e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.071758e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 1.580395 sec + 4,590,869,899 cycles # 2.898 GHz + 10,811,034,467 instructions # 2.35 insn per cycle + 1.596114627 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4285) (512y: 234) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213600217192 Relative difference = 4.5288254008796884e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.838145e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.883823e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.883823e+05 ) sec^-1 -MeanMatrixElemValue = ( 4.740115e+02 +- 2.671575e+02 ) GeV^-2 -TOTAL : 0.916566 sec - 3,029,597,137 cycles # 3.300 GHz - 6,246,692,057 instructions # 2.06 insn per cycle - 0.918942029 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.586932e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.694563e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.694563e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.008920e+02 +- 5.001681e+01 ) GeV^-2 +TOTAL : 2.184061 sec + 4,216,157,602 cycles # 1.927 GHz + 6,273,944,868 instructions # 1.49 insn per cycle + 2.195028764 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1961) (512y: 163) (512z: 3617) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttg.mad/SubProcesses/P1_gg_ttxg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.413122e+00 Avg ME (F77/C++) = 1.4131213786174055 Relative difference = 4.3972324717191576e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt index a3ce2c07d0..c4f627d4b9 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:13:03 +DATE: 2024-03-01_02:32:32 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.456101e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.489020e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.491439e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.526891 sec + 2,312,216,646 cycles # 3.007 GHz + 3,538,385,257 instructions # 1.53 insn per cycle + 0.841955777 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.122556e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.158071e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.159487e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.037875 sec + 10,086,152,870 cycles # 3.059 GHz + 22,511,661,776 instructions # 2.23 insn per cycle + 3.352868148 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626675e-04 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.481783e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.483017e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.483017e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 6.616014 sec - 23,134,926,646 cycles # 3.496 GHz - 78,769,826,090 instructions # 3.40 insn per cycle - 6.618301143 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4856) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.962967e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.963888e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.963888e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.365178 sec + 25,629,682,297 cycles # 3.063 GHz + 78,935,463,104 instructions # 3.08 insn per cycle + 8.371779038 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.886296e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.890965e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.890965e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.363857 sec - 11,665,465,951 cycles # 3.466 GHz - 39,273,332,129 instructions # 3.37 insn per cycle - 3.366166984 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.775994e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.779313e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.779313e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.352554 sec + 12,920,825,541 cycles # 2.966 GHz + 39,280,019,197 instructions # 3.04 insn per cycle + 4.370436126 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.126568e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.128873e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.128873e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.462899 sec - 4,833,071,970 cycles # 3.300 GHz - 13,680,344,924 instructions # 2.83 insn per cycle - 1.465169066 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.587371e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.605210e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.605210e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.920439 sec + 5,577,220,412 cycles # 2.899 GHz + 13,686,699,383 instructions # 2.45 insn per cycle + 1.933532640 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.280240e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.283273e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.283273e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.288270 sec - 4,256,880,431 cycles # 3.300 GHz - 12,334,305,617 instructions # 2.90 insn per cycle - 1.290601255 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.660129e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.682450e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.682450e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.708010 sec + 4,898,677,790 cycles # 2.863 GHz + 12,341,670,637 instructions # 2.52 insn per cycle + 1.722166284 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.023870e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.031547e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.031547e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.817740 sec - 2,700,603,798 cycles # 3.296 GHz - 6,324,447,395 instructions # 2.34 insn per cycle - 0.820100253 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.531084e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.544719e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.544719e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.187284 sec + 4,109,191,778 cycles # 1.875 GHz + 6,335,550,253 instructions # 1.54 insn per cycle + 2.200752564 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt index 1d56a4470b..8d1778e673 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_bridge.txt @@ -1,188 +1,237 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:30:51 +DATE: 2024-03-01_03:16:28 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.142985e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.469804e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.469804e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.511155 sec + 2,228,194,908 cycles # 3.016 GHz + 3,541,287,827 instructions # 1.59 insn per cycle + 0.799045956 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.621948e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.093950e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.093950e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.305480 sec + 10,998,775,521 cycles # 3.077 GHz + 24,493,841,360 instructions # 2.23 insn per cycle + 3.633710964 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626675e-04 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.481958e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.483200e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.483200e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 6.617892 sec - 23,143,239,754 cycles # 3.496 GHz - 78,775,014,566 instructions # 3.40 insn per cycle - 6.620381527 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4856) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.956691e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.957671e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.957671e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.395628 sec + 25,661,453,890 cycles # 3.059 GHz + 78,946,626,848 instructions # 3.08 insn per cycle + 8.400144517 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.816064e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.820578e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.820578e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.415562 sec - 11,674,359,157 cycles # 3.416 GHz - 39,286,928,254 instructions # 3.37 insn per cycle - 3.418065939 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.779486e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.783121e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.783121e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.352704 sec + 12,939,532,043 cycles # 2.970 GHz + 39,292,271,047 instructions # 3.04 insn per cycle + 4.357352756 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.129360e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.131693e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.131693e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.461769 sec - 4,829,397,024 cycles # 3.300 GHz - 13,689,355,371 instructions # 2.83 insn per cycle - 1.464289743 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.560149e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.578951e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.578951e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.929060 sec + 5,589,750,479 cycles # 2.892 GHz + 13,696,577,373 instructions # 2.45 insn per cycle + 1.933630865 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.278771e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.281807e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.281807e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.292785 sec - 4,271,370,436 cycles # 3.300 GHz - 12,345,376,003 instructions # 2.89 insn per cycle - 1.295310523 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.749338e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.772565e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.772565e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.695619 sec + 4,910,055,408 cycles # 2.889 GHz + 12,351,492,799 instructions # 2.52 insn per cycle + 1.700097015 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.021965e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.029524e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.029524e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.821349 sec - 2,711,016,004 cycles # 3.294 GHz - 6,335,916,291 instructions # 2.34 insn per cycle - 0.823888580 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.621116e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.636094e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.636094e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.165843 sec + 4,123,850,554 cycles # 1.901 GHz + 6,345,407,560 instructions # 1.54 insn per cycle + 2.170297070 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt index dc8d1365ef..597fd5665a 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_common.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:35:49 +DATE: 2024-03-01_03:28:08 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.502974e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.532224e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.534544e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 +TOTAL : 0.505991 sec + 2,242,092,583 cycles # 3.014 GHz + 3,466,791,908 instructions # 1.55 insn per cycle + 0.811853126 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.137461e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.171030e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.172456e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.252232e+02 +- 1.234346e+02 ) GeV^-4 +TOTAL : 3.124130 sec + 10,356,034,147 cycles # 3.069 GHz + 23,417,816,833 instructions # 2.26 insn per cycle + 3.433693053 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626675e-04 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.481844e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.483078e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.483078e+03 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.957351e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.958278e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.958278e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 6.615618 sec - 23,132,699,978 cycles # 3.496 GHz - 78,769,037,054 instructions # 3.41 insn per cycle - 6.617829059 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4856) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 8.389537 sec + 25,646,805,438 cycles # 3.056 GHz + 78,935,262,340 instructions # 3.08 insn per cycle + 8.393631651 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.815456e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.819978e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.819978e+03 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.762997e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.766514e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.766514e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.413181 sec - 11,662,452,031 cycles # 3.415 GHz - 39,273,293,393 instructions # 3.37 insn per cycle - 3.415398215 seconds time elapsed +TOTAL : 4.369422 sec + 12,916,153,129 cycles # 2.954 GHz + 39,278,867,860 instructions # 3.04 insn per cycle + 4.373667823 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.130382e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.132719e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.132719e+04 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.528032e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.546362e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.546362e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.458091 sec - 4,816,775,693 cycles # 3.300 GHz - 13,680,360,383 instructions # 2.84 insn per cycle - 1.460366647 seconds time elapsed +TOTAL : 1.933878 sec + 5,580,678,683 cycles # 2.881 GHz + 13,684,529,284 instructions # 2.45 insn per cycle + 1.937965494 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.278483e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.281512e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.281512e+04 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.723484e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.746463e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.746463e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.290121 sec - 4,262,527,735 cycles # 3.300 GHz - 12,334,714,107 instructions # 2.89 insn per cycle - 1.292329143 seconds time elapsed +TOTAL : 1.697628 sec + 4,903,453,092 cycles # 2.882 GHz + 12,338,806,795 instructions # 2.52 insn per cycle + 1.701856837 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.025121e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.032750e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.032750e+04 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.314965e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.328200e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.328200e+03 ) sec^-1 MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.817158 sec - 2,696,851,480 cycles # 3.294 GHz - 6,324,641,682 instructions # 2.35 insn per cycle - 0.819432272 seconds time elapsed +TOTAL : 2.253262 sec + 4,111,107,725 cycles # 1.822 GHz + 6,332,329,650 instructions # 1.54 insn per cycle + 2.257544828 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt index 93293d216b..60e01cd2dd 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_curhst.txt @@ -1,138 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:34:23 +DATE: 2024-03-01_03:24:50 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.510827e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.539312e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.541615e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.503655 sec + 2,239,000,994 cycles # 3.024 GHz + 3,553,306,239 instructions # 1.59 insn per cycle + 0.813367897 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.145153e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.179407e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.180837e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.061657 sec + 10,188,245,124 cycles # 3.074 GHz + 23,248,414,020 instructions # 2.28 insn per cycle + 3.370951944 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626675e-04 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe: Aborted - 4,606,961 cycles # 3.257 GHz - 6,302,943 instructions # 1.37 insn per cycle - 0.037961845 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4856) (avx2: 0) (512y: 0) (512z: 0) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.938252e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.939166e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.939166e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.471636 sec + 25,650,928,170 cycles # 3.027 GHz + 78,935,761,644 instructions # 3.08 insn per cycle + 8.475777896 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe: Aborted - 4,632,503 cycles # 2.696 GHz - 6,332,916 instructions # 1.37 insn per cycle - 0.038722425 seconds time elapsed +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.732481e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.735838e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.735838e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.403548 sec + 12,924,361,173 cycles # 2.933 GHz + 39,279,334,894 instructions # 3.04 insn per cycle + 4.407811208 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe: Aborted - 4,502,042 cycles # 2.684 GHz - 6,323,077 instructions # 1.40 insn per cycle - 0.038401230 seconds time elapsed +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.485088e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.502714e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.502714e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.942219 sec + 5,571,920,631 cycles # 2.864 GHz + 13,685,480,241 instructions # 2.46 insn per cycle + 1.946449782 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe: Aborted - 4,898,492 cycles # 3.254 GHz - 6,342,668 instructions # 1.29 insn per cycle - 0.037656802 seconds time elapsed +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.737761e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.761950e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.761950e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.693500 sec + 4,894,918,115 cycles # 2.884 GHz + 12,340,665,409 instructions # 2.52 insn per cycle + 1.697702233 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe: Aborted - 4,865,885 cycles # 3.245 GHz - 6,331,116 instructions # 1.30 insn per cycle - 0.038706032 seconds time elapsed +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.532631e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.547658e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.547658e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.186925 sec + 4,105,530,431 cycles # 1.874 GHz + 6,333,977,995 instructions # 1.54 insn per cycle + 2.191453097 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt index f7105fde21..de32359ede 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd0_rmbhst.txt @@ -1,183 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:33:23 +DATE: 2024-03-01_03:21:36 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.198300e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.499375e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.501597e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.508517 sec + 2,246,531,629 cycles # 3.011 GHz + 3,559,465,442 instructions # 1.58 insn per cycle + 0.806328345 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.741268e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.175443e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.176848e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.195111 sec + 10,565,694,760 cycles # 3.061 GHz + 24,272,327,456 instructions # 2.30 insn per cycle + 3.508790742 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626675e-04 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.481824e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.483072e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.483072e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 6.615817 sec - 23,135,149,155 cycles # 3.496 GHz - 78,769,523,152 instructions # 3.40 insn per cycle - 6.618024105 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4856) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.950947e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.951893e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.951893e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.415718 sec + 25,630,796,247 cycles # 3.044 GHz + 78,935,144,677 instructions # 3.08 insn per cycle + 8.419920398 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4893) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.822215e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.826748e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.826748e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.408335 sec - 11,661,748,107 cycles # 3.420 GHz - 39,273,222,931 instructions # 3.37 insn per cycle - 3.410571683 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.749651e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.752979e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.752979e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.383944 sec + 12,941,364,841 cycles # 2.950 GHz + 39,279,009,350 instructions # 3.04 insn per cycle + 4.388336169 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13184) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.127496e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.129817e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.129817e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.461766 sec - 4,828,963,686 cycles # 3.300 GHz - 13,679,994,736 instructions # 2.83 insn per cycle - 1.464026212 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.444820e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.462277e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.462277e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.951803 sec + 5,576,482,664 cycles # 2.852 GHz + 13,685,505,947 instructions # 2.45 insn per cycle + 1.956019187 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11357) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.278160e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.281173e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.281173e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.290276 sec - 4,260,419,952 cycles # 3.298 GHz - 12,334,650,697 instructions # 2.90 insn per cycle - 1.292701283 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.751887e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.775334e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.775334e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.690955 sec + 4,892,330,509 cycles # 2.888 GHz + 12,340,572,549 instructions # 2.52 insn per cycle + 1.695111197 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10266) (512y: 88) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.023476e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.031069e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.031069e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.817764 sec - 2,700,756,236 cycles # 3.297 GHz - 6,324,382,140 instructions # 2.34 insn per cycle - 0.819998960 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.643060e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.657306e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.657306e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.155567 sec + 4,105,793,778 cycles # 1.902 GHz + 6,333,858,387 instructions # 1.54 insn per cycle + 2.159935327 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1821) (512y: 102) (512z: 9375) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt index cad62f799d..836b2fd223 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl0_hrd1.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:13:22 +DATE: 2024-03-01_02:33:08 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.456815e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.489621e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.492178e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.523446 sec + 2,259,779,898 cycles # 2.994 GHz + 3,514,783,609 instructions # 1.56 insn per cycle + 0.830655921 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.127813e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.161921e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.163304e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.027147 sec + 10,102,095,677 cycles # 3.066 GHz + 22,774,733,235 instructions # 2.25 insn per cycle + 3.352533111 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626675e-04 +Avg ME (F77/CUDA) = 6.6266731198158133E-004 +Relative difference = 2.837296512218831e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.503316e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.504578e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.504578e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 6.559386 sec - 22,937,995,811 cycles # 3.496 GHz - 78,503,071,109 instructions # 3.42 insn per cycle - 6.561632048 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4246) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.968945e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.969930e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.969930e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.342362 sec + 25,562,894,530 cycles # 3.064 GHz + 78,707,498,900 instructions # 3.08 insn per cycle + 8.350709191 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4264) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141133E-004 Relative difference = 2.8372990776517314e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.721965e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.726341e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.726341e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.480511 sec - 11,746,156,846 cycles # 3.373 GHz - 39,219,964,596 instructions # 3.34 insn per cycle - 3.482689146 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.758058e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.761397e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.761397e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.374701 sec + 12,919,245,066 cycles # 2.951 GHz + 39,226,355,054 instructions # 3.04 insn per cycle + 4.387657418 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12951) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141122E-004 Relative difference = 2.837299079287849e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.134275e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.136627e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.136627e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.452965 sec - 4,799,994,769 cycles # 3.300 GHz - 13,794,367,131 instructions # 2.87 insn per cycle - 1.455293281 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.289947e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.307265e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.307265e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.987975 sec + 5,629,143,308 cycles # 2.825 GHz + 13,800,788,871 instructions # 2.45 insn per cycle + 1.999251955 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11422) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.263188e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.266183e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.266183e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.305376 sec - 4,313,294,929 cycles # 3.300 GHz - 12,459,449,195 instructions # 2.89 insn per cycle - 1.307767380 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.607973e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.629961e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.629961e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.716692 sec + 4,942,228,477 cycles # 2.873 GHz + 12,466,581,724 instructions # 2.52 insn per cycle + 1.728222884 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10258) (512y: 240) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.017201e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.024979e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.024979e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.820329 sec - 2,710,241,576 cycles # 3.297 GHz - 6,448,202,008 instructions # 2.38 insn per cycle - 0.822690390 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.633414e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.646913e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.646913e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.159145 sec + 4,117,977,410 cycles # 1.904 GHz + 6,458,802,297 instructions # 1.57 insn per cycle + 2.172057894 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1647) (512y: 192) (512z: 9375) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198157320E-004 Relative difference = 2.837296634927675e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt index 70731b668b..5cb26f1dc5 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd0.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:24:37 +DATE: 2024-03-01_03:05:40 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.234238e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.262824e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.264818e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.529504 sec + 2,311,611,520 cycles # 3.006 GHz + 3,548,053,349 instructions # 1.53 insn per cycle + 0.826491750 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.771596e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.800183e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.801376e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.298192 sec + 10,832,117,508 cycles # 3.051 GHz + 23,123,371,744 instructions # 2.13 insn per cycle + 3.609870208 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626675e-04 +Avg ME (F77/CUDA) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 5.040051e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.040564e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.040564e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 32.548023 sec - 113,830,166,208 cycles # 3.497 GHz - 143,883,071,750 instructions # 1.26 insn per cycle - 32.550633058 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:21053) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.437828e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.438319e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.438319e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 36.966049 sec + 113,615,073,618 cycles # 3.074 GHz + 144,968,095,911 instructions # 1.28 insn per cycle + 36.970400514 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:21301) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198140450E-004 Relative difference = 2.83729918072716e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.954257e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.957301e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.957301e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.155080 sec - 14,533,662,034 cycles # 3.496 GHz - 37,569,403,247 instructions # 2.58 insn per cycle - 4.157544496 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.281454e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.284254e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.284254e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 5.007790 sec + 14,730,075,423 cycles # 2.939 GHz + 37,574,123,368 instructions # 2.55 insn per cycle + 5.012256986 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68119) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141209E-004 Relative difference = 2.8372990661989057e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.045261e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.060223e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.060223e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.820401 sec - 6,012,541,840 cycles # 3.300 GHz - 13,057,135,841 instructions # 2.17 insn per cycle - 1.822679574 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.743950e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.758262e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.758262e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.127650 sec + 6,163,100,705 cycles # 2.892 GHz + 13,061,449,928 instructions # 2.12 insn per cycle + 2.132187716 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:46960) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.099476e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.101705e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.101705e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.499045 sec - 4,952,169,341 cycles # 3.300 GHz - 11,434,777,878 instructions # 2.31 insn per cycle - 1.501339023 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.460039e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.482215e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.482215e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.743142 sec + 5,059,957,423 cycles # 2.897 GHz + 11,440,000,239 instructions # 2.26 insn per cycle + 1.747501406 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:40434) (512y: 285) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.995108e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.002501e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.002501e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.829410 sec - 2,740,060,865 cycles # 3.297 GHz - 5,933,015,109 instructions # 2.17 insn per cycle - 0.831788683 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.938377e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.953416e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.953416e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.075865 sec + 3,979,244,183 cycles # 1.914 GHz + 5,942,139,795 instructions # 1.49 insn per cycle + 2.080305520 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2455) (512y: 337) (512z:39411) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt index 37ef5ca493..afca4b7953 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_d_inl1_hrd1.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_d_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:25:24 +DATE: 2024-03-01_03:06:48 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.244633e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.273686e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.275983e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.531287 sec + 2,311,991,159 cycles # 3.015 GHz + 3,584,221,599 instructions # 1.55 insn per cycle + 0.825938734 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.793538e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.821908e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.823116e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.269849 sec + 10,805,743,512 cycles # 3.068 GHz + 25,084,175,459 instructions # 2.32 insn per cycle + 3.579404730 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626675e-04 +Avg ME (F77/CUDA) = 6.6266731198158122E-004 +Relative difference = 2.837296513854949e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.995413e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.995921e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.995921e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 32.838703 sec - 114,844,431,122 cycles # 3.497 GHz - 144,446,965,492 instructions # 1.26 insn per cycle - 32.841012912 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:22369) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.412070e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.412546e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.412546e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 37.253529 sec + 114,121,742,420 cycles # 3.069 GHz + 145,689,073,244 instructions # 1.28 insn per cycle + 37.257693750 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:22559) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198140450E-004 Relative difference = 2.83729918072716e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.843711e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.846578e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.846578e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 4.274359 sec - 14,949,342,714 cycles # 3.496 GHz - 37,756,081,643 instructions # 2.53 insn per cycle - 4.276658422 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.198627e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.201180e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.201180e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 5.136766 sec + 15,152,451,249 cycles # 2.948 GHz + 37,761,291,325 instructions # 2.49 insn per cycle + 5.141156615 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:68447) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198141209E-004 Relative difference = 2.8372990661989057e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.243220e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.258795e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.258795e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.781599 sec - 5,884,681,951 cycles # 3.300 GHz - 12,892,638,092 instructions # 2.19 insn per cycle - 1.783976623 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.950126e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.965335e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.965335e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.072422 sec + 6,013,210,013 cycles # 2.896 GHz + 12,895,807,400 instructions # 2.14 insn per cycle + 2.076740513 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:45929) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.089187e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.091372e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.091372e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.513045 sec - 4,998,315,786 cycles # 3.300 GHz - 11,441,056,031 instructions # 2.29 insn per cycle - 1.515430361 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.394633e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.416357e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.416357e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.755119 sec + 5,091,337,522 cycles # 2.895 GHz + 11,446,622,503 instructions # 2.25 insn per cycle + 1.759562583 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:40123) (512y: 219) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.009625e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.017155e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.017155e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.823716 sec - 2,720,763,680 cycles # 3.298 GHz - 5,887,165,290 instructions # 2.16 insn per cycle - 0.826091686 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.001850e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.017431e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.017431e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.059473 sec + 3,944,538,203 cycles # 1.912 GHz + 5,896,184,476 instructions # 1.49 insn per cycle + 2.063940696 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1971) (512y: 259) (512z:38937) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_d_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731198156789E-004 Relative difference = 2.837296715097453e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt index d61f6be5ab..082176c355 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:13:41 +DATE: 2024-03-01_02:33:45 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 6.331619e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.392833e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.401451e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.481440 sec + 2,077,514,231 cycles # 2.979 GHz + 3,093,505,744 instructions # 1.49 insn per cycle + 0.777796663 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.622317e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.697439e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.700567e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 +TOTAL : 1.713365 sec + 5,944,272,538 cycles # 3.053 GHz + 12,632,277,461 instructions # 2.13 insn per cycle + 2.004079656 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.541023e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.542272e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.542272e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.472852 sec - 22,594,942,433 cycles # 3.496 GHz - 78,109,438,979 instructions # 3.46 insn per cycle - 6.475034460 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3570) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.049682e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.050694e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.050694e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.010109 sec + 24,614,432,061 cycles # 3.072 GHz + 78,126,558,251 instructions # 3.17 insn per cycle + 8.016891762 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274863266294753E-004 Relative difference = 4.92840687132121e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.868488e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.888089e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.888089e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 -TOTAL : 1.680412 sec - 5,721,227,554 cycles # 3.426 GHz - 20,115,812,778 instructions # 3.52 insn per cycle - 1.682695212 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.386833e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.400650e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.400650e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.228676 sec + 6,461,822,382 cycles # 2.894 GHz + 20,120,855,558 instructions # 3.11 insn per cycle + 2.241648353 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861460025036E-004 Relative difference = 2.2029847170826283e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.195033e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.204366e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.204366e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.753674 sec - 2,492,228,979 cycles # 3.300 GHz - 6,983,768,704 instructions # 2.80 insn per cycle - 0.755942549 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.671811e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.678370e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.678370e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.990019 sec + 2,821,251,649 cycles # 2.839 GHz + 6,989,221,748 instructions # 2.48 insn per cycle + 1.002444816 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.521876e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.534107e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.534107e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.656748 sec - 2,172,255,103 cycles # 3.300 GHz - 6,290,065,451 instructions # 2.90 insn per cycle - 0.658959491 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.922237e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.931217e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.931217e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.861179 sec + 2,488,986,957 cycles # 2.876 GHz + 6,296,476,670 instructions # 2.53 insn per cycle + 0.887481911 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.029368e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.061175e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.061175e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.413448 sec - 1,368,272,372 cycles # 3.298 GHz - 3,258,833,410 instructions # 2.38 insn per cycle - 0.415626098 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.534197e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.539839e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.539839e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.078476 sec + 2,048,809,794 cycles # 1.894 GHz + 3,266,667,713 instructions # 1.59 insn per cycle + 1.091634951 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952779373838E-004 Relative difference = 4.193891735414155e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt index 4bdc113ec5..6f564b583c 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_bridge.txt @@ -1,188 +1,237 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:31:10 +DATE: 2024-03-01_03:17:05 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.665443e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.315182e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.315182e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 +TOTAL : 0.468201 sec + 2,060,292,715 cycles # 2.983 GHz + 3,094,906,819 instructions # 1.50 insn per cycle + 0.750075013 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 7.249943e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.466015e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.466015e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 +TOTAL : 1.882218 sec + 6,478,461,444 cycles # 3.059 GHz + 12,879,929,349 instructions # 1.99 insn per cycle + 2.174649918 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.540494e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.541741e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.541741e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.463976 sec - 22,603,420,847 cycles # 3.496 GHz - 78,113,326,401 instructions # 3.46 insn per cycle - 6.466307048 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3570) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.041429e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.042536e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.042536e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.044775 sec + 24,623,818,516 cycles # 3.060 GHz + 78,132,484,739 instructions # 3.17 insn per cycle + 8.049291657 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274863266294753E-004 Relative difference = 4.92840687132121e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.972987e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.992778e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.992778e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 -TOTAL : 1.652838 sec - 5,725,272,561 cycles # 3.461 GHz - 20,124,950,762 instructions # 3.52 insn per cycle - 1.655209946 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.498892e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.513186e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.513186e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.197009 sec + 6,464,288,620 cycles # 2.938 GHz + 20,129,426,624 instructions # 3.11 insn per cycle + 2.201352169 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861460025036E-004 Relative difference = 2.2029847170826283e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.251048e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.260877e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.260877e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.736738 sec - 2,436,577,597 cycles # 3.300 GHz - 6,993,293,212 instructions # 2.87 insn per cycle - 0.739071334 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.703352e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.711063e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.711063e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.973161 sec + 2,827,392,405 cycles # 2.894 GHz + 6,998,075,079 instructions # 2.48 insn per cycle + 0.977561277 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.520306e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.532523e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.532523e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.658909 sec - 2,179,756,886 cycles # 3.300 GHz - 6,299,575,077 instructions # 2.89 insn per cycle - 0.661241811 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.931885e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.940835e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.940835e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.859317 sec + 2,491,742,914 cycles # 2.887 GHz + 6,305,390,293 instructions # 2.53 insn per cycle + 0.863665296 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.103354e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.136311e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.136311e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.408039 sec - 1,350,732,763 cycles # 3.299 GHz - 3,269,590,389 instructions # 2.42 insn per cycle - 0.410460189 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.551095e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.557002e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.557002e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.067932 sec + 2,057,227,059 cycles # 1.920 GHz + 3,276,345,738 instructions # 1.59 insn per cycle + 1.072312021 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952779373838E-004 Relative difference = 4.193891735414155e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt index d77de8d4f6..66226e8d59 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_common.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:36:07 +DATE: 2024-03-01_03:28:45 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 6.308056e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.358553e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.363626e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.159397e-01 +- 3.238804e-01 ) GeV^-4 +TOTAL : 0.461299 sec + 2,006,885,691 cycles # 2.992 GHz + 3,022,532,155 instructions # 1.51 insn per cycle + 0.728549346 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --common +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --common OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:COMMON+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.572531e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.646089e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.649338e+05 ) sec^-1 +MeanMatrixElemValue = ( 1.094367e+02 +- 1.071509e+02 ) GeV^-4 +TOTAL : 1.795584 sec + 6,148,728,410 cycles # 3.042 GHz + 12,326,233,623 instructions # 2.00 insn per cycle + 2.078967785 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.539698e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.540976e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.540976e+03 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.053824e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.054841e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.054841e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.464734 sec - 22,602,962,065 cycles # 3.496 GHz - 78,108,873,476 instructions # 3.46 insn per cycle - 6.466917782 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3570) (avx2: 0) (512y: 0) (512z: 0) +TOTAL : 7.994149 sec + 24,620,138,866 cycles # 3.079 GHz + 78,125,377,108 instructions # 3.17 insn per cycle + 7.998228624 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274863266294753E-004 Relative difference = 4.92840687132121e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.925627e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.945236e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.945236e+03 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.346279e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.360483e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.360483e+03 ) sec^-1 MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 -TOTAL : 1.658772 sec - 5,725,250,336 cycles # 3.449 GHz - 20,115,356,658 instructions # 3.51 insn per cycle - 1.660966677 seconds time elapsed +TOTAL : 2.242069 sec + 6,461,640,731 cycles # 2.878 GHz + 20,121,052,869 instructions # 3.11 insn per cycle + 2.246196034 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861460025036E-004 Relative difference = 2.2029847170826283e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.251477e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.261264e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.261264e+04 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.685316e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.692321e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.692321e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.734787 sec - 2,429,802,465 cycles # 3.300 GHz - 6,983,714,880 instructions # 2.87 insn per cycle - 0.736976159 seconds time elapsed +TOTAL : 0.982986 sec + 2,822,415,829 cycles # 2.862 GHz + 6,987,486,660 instructions # 2.48 insn per cycle + 0.987025186 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.521461e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.533742e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.533742e+04 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.936405e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.945906e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.945906e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.656838 sec - 2,172,470,830 cycles # 3.300 GHz - 6,289,863,881 instructions # 2.90 insn per cycle - 0.658982044 seconds time elapsed +TOTAL : 0.855808 sec + 2,484,894,865 cycles # 2.892 GHz + 6,291,816,709 instructions # 2.53 insn per cycle + 0.859867773 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --common OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.110621e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.143899e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.143899e+04 ) sec^-1 +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.547512e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.553394e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.553394e+04 ) sec^-1 MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.405468 sec - 1,341,887,518 cycles # 3.298 GHz - 3,258,430,365 instructions # 2.43 insn per cycle - 0.407620788 seconds time elapsed +TOTAL : 1.069890 sec + 2,051,026,977 cycles # 1.912 GHz + 3,263,937,559 instructions # 1.59 insn per cycle + 1.073863100 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952779373838E-004 Relative difference = 4.193891735414155e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt index 06ac489be0..e810053300 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_curhst.txt @@ -1,138 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:34:29 +DATE: 2024-03-01_03:25:26 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 6.337764e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.388253e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.393743e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.460965 sec + 2,014,485,763 cycles # 3.003 GHz + 3,009,625,577 instructions # 1.49 insn per cycle + 0.728425666 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --curhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --curhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.558734e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.632343e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.635567e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 +TOTAL : 1.743753 sec + 6,041,672,737 cycles # 3.067 GHz + 12,221,124,809 instructions # 2.02 insn per cycle + 2.027112098 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe: Aborted - 4,588,745 cycles # 3.254 GHz - 6,297,711 instructions # 1.37 insn per cycle - 0.038493069 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3570) (avx2: 0) (512y: 0) (512z: 0) +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.040104e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.041097e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.041097e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.046597 sec + 24,613,022,395 cycles # 3.060 GHz + 78,130,326,722 instructions # 3.17 insn per cycle + 8.050808561 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274863266294753E-004 Relative difference = 4.92840687132121e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe: Aborted - 4,649,873 cycles # 3.255 GHz - 6,330,030 instructions # 1.36 insn per cycle - 0.038267941 seconds time elapsed +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.468090e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.482424e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.482424e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.203809 sec + 6,456,229,713 cycles # 2.925 GHz + 20,119,923,968 instructions # 3.12 insn per cycle + 2.207913022 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861460025036E-004 Relative difference = 2.2029847170826283e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe: Aborted - 4,641,444 cycles # 3.257 GHz - 6,326,220 instructions # 1.36 insn per cycle - 0.037581414 seconds time elapsed +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.712278e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.719631e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.719631e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.965705 sec + 2,817,996,939 cycles # 2.908 GHz + 6,988,025,639 instructions # 2.48 insn per cycle + 0.969794950 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe: Aborted - 4,553,782 cycles # 3.259 GHz - 6,327,532 instructions # 1.39 insn per cycle - 0.038327048 seconds time elapsed +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.924856e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.934354e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.934354e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.859873 sec + 2,483,822,785 cycles # 2.877 GHz + 6,295,526,273 instructions # 2.53 insn per cycle + 0.863979329 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --curhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe: Aborted - 4,589,693 cycles # 3.232 GHz - 6,342,822 instructions # 1.38 insn per cycle - 0.039622291 seconds time elapsed +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.552387e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.558368e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.558368e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.064425 sec + 2,047,040,960 cycles # 1.917 GHz + 3,265,583,381 instructions # 1.60 insn per cycle + 1.068371519 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952779373838E-004 Relative difference = 4.193891735414155e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt index 597c41c4c4..29def3747b 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd0_rmbhst.txt @@ -1,183 +1,223 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:33:42 +DATE: 2024-03-01_03:22:13 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.727516e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.381665e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.387640e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.048178e+00 +- 2.364571e+00 ) GeV^-4 +TOTAL : 0.463988 sec + 2,009,660,419 cycles # 2.987 GHz + 3,043,780,102 instructions # 1.51 insn per cycle + 0.732052318 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --rmbhst +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --rmbhst OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 7.463642e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.641012e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.644220e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.641710e+00 +- 4.994249e+00 ) GeV^-4 +TOTAL : 1.829361 sec + 6,179,090,687 cycles # 3.005 GHz + 13,497,023,724 instructions # 2.18 insn per cycle + 2.119489112 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.541255e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.542497e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.542497e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.460363 sec - 22,594,860,702 cycles # 3.497 GHz - 78,108,705,130 instructions # 3.46 insn per cycle - 6.462497444 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3570) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.033662e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.034665e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.034665e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.072340 sec + 24,646,233,583 cycles # 3.055 GHz + 78,130,465,005 instructions # 3.17 insn per cycle + 8.076398723 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3603) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274863266294753E-004 Relative difference = 4.92840687132121e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.980894e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.000089e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.000089e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 -TOTAL : 1.649650 sec - 5,715,455,769 cycles # 3.462 GHz - 20,116,970,711 instructions # 3.52 insn per cycle - 1.651831111 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.437406e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.451013e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.451013e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.213064 sec + 6,463,144,308 cycles # 2.916 GHz + 20,121,040,605 instructions # 3.11 insn per cycle + 2.217197026 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13763) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861460025036E-004 Relative difference = 2.2029847170826283e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.224266e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.233876e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.233876e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.743696 sec - 2,459,110,588 cycles # 3.300 GHz - 6,984,047,266 instructions # 2.84 insn per cycle - 0.745892046 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.690865e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.698060e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.698060e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.977816 sec + 2,816,932,981 cycles # 2.871 GHz + 6,987,870,279 instructions # 2.48 insn per cycle + 0.981891147 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11874) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.517139e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.529391e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.529391e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.657921 sec - 2,175,611,006 cycles # 3.300 GHz - 6,290,403,371 instructions # 2.89 insn per cycle - 0.660110597 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.925443e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.934689e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.934689e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.859893 sec + 2,483,713,955 cycles # 2.877 GHz + 6,295,351,555 instructions # 2.53 insn per cycle + 0.863911879 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10822) (512y: 43) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271938174574524E-004 Relative difference = 2.7544470208782633e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe -p 64 256 1 --rmbhst OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.107456e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.140872e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.140872e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.405676 sec - 1,342,942,249 cycles # 3.298 GHz - 3,258,636,901 instructions # 2.43 insn per cycle - 0.407877226 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.552325e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.558086e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.558086e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.064299 sec + 2,046,605,748 cycles # 1.917 GHz + 3,265,707,472 instructions # 1.60 insn per cycle + 1.068273671 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2414) (512y: 46) (512z: 9571) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952779373838E-004 Relative difference = 4.193891735414155e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt index 2ad237dd1a..50b444080d 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl0_hrd1.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:13:56 +DATE: 2024-03-01_02:34:14 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 6.321381e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.374979e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.380502e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.482038 sec + 2,083,496,491 cycles # 2.987 GHz + 3,090,021,729 instructions # 1.48 insn per cycle + 0.780369869 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 8.505248e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.577137e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.580211e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 +TOTAL : 1.719742 sec + 5,952,430,615 cycles # 3.047 GHz + 11,750,571,480 instructions # 1.97 insn per cycle + 2.009992190 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262659968156085E-004 +Relative difference = 2.8371612387547027e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.559405e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.560665e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.560665e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208459e-01 +- 3.253446e-01 ) GeV^-4 -TOTAL : 6.414841 sec - 22,433,654,429 cycles # 3.497 GHz - 77,753,050,419 instructions # 3.47 insn per cycle - 6.417041697 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 3125) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.039243e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.040268e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.040268e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060121e+00 +- 2.367902e+00 ) GeV^-4 +TOTAL : 8.050624 sec + 24,577,706,132 cycles # 3.054 GHz + 77,857,469,800 instructions # 3.17 insn per cycle + 8.057072902 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 3114) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 -Avg ME (C++/C++) = 6.627486e-04 -Avg ME (F77/C++) = 6.6274863364631370E-004 -Relative difference = 5.076783822441729e-08 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +Avg ME (C++/C++) = 6.627487e-04 +Avg ME (F77/C++) = 6.6274866268634797E-004 +Relative difference = 5.630135835748959e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.823157e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.842320e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.842320e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.208457e-01 +- 3.253445e-01 ) GeV^-4 -TOTAL : 1.676003 sec - 5,656,803,299 cycles # 3.372 GHz - 20,080,771,686 instructions # 3.55 insn per cycle - 1.678252022 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.236562e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.248995e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.248995e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.060119e+00 +- 2.367901e+00 ) GeV^-4 +TOTAL : 2.274363 sec + 6,415,212,085 cycles # 2.816 GHz + 20,086,390,532 instructions # 3.13 insn per cycle + 2.288238797 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13452) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274861465384638E-004 Relative difference = 2.211071647257023e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.186032e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.197298e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.197298e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.756528 sec - 2,501,574,254 cycles # 3.300 GHz - 7,125,249,610 instructions # 2.85 insn per cycle - 0.758780713 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.636656e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.643300e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.643300e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 1.010969 sec + 2,918,129,602 cycles # 2.878 GHz + 7,130,827,098 instructions # 2.44 insn per cycle + 1.024648825 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:12261) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271939668077068E-004 Relative difference = 5.008498817890231e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.413858e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.425042e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.425042e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214978e-01 +- 3.255521e-01 ) GeV^-4 -TOTAL : 0.685713 sec - 2,267,843,391 cycles # 3.300 GHz - 6,433,455,899 instructions # 2.84 insn per cycle - 0.687939686 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.848024e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.856123e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.856123e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060560e+00 +- 2.367611e+00 ) GeV^-4 +TOTAL : 0.895519 sec + 2,583,274,132 cycles # 2.873 GHz + 6,439,451,842 instructions # 2.49 insn per cycle + 0.910176239 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11276) (512y: 27) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627194e-04 Avg ME (F77/C++) = 6.6271939668077068E-004 Relative difference = 5.008498817890231e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.907924e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.937842e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.937842e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.214981e-01 +- 3.255523e-01 ) GeV^-4 -TOTAL : 0.426031 sec - 1,409,552,540 cycles # 3.297 GHz - 3,420,395,875 instructions # 2.43 insn per cycle - 0.428204579 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.488982e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.494377e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.494377e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060562e+00 +- 2.367612e+00 ) GeV^-4 +TOTAL : 1.109477 sec + 2,120,739,457 cycles # 1.905 GHz + 3,428,489,642 instructions # 1.62 insn per cycle + 1.120804955 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2911) (512y: 22) (512z: 9647) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627195e-04 Avg ME (F77/C++) = 6.6271952032322112E-004 Relative difference = 3.066639970473621e-08 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt index 146b66385f..3e610d68fd 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd0.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:26:12 +DATE: 2024-03-01_03:07:56 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.548079e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.594396e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.599390e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.487762 sec + 2,117,397,644 cycles # 2.979 GHz + 3,170,491,357 instructions # 1.50 insn per cycle + 0.771619877 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 7.728616e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.789567e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.792128e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 +TOTAL : 1.852993 sec + 6,403,206,858 cycles # 3.066 GHz + 13,984,822,985 instructions # 2.18 insn per cycle + 2.145838793 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262660579844562E-004 +Relative difference = 2.836238137986709e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.142872e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.143601e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.143601e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.204932e-01 +- 3.252405e-01 ) GeV^-4 -TOTAL : 26.704464 sec - 93,392,949,985 cycles # 3.497 GHz - 135,135,739,845 instructions # 1.45 insn per cycle - 26.706712729 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:15558) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.747654e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.748466e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.748466e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 +TOTAL : 28.541681 sec + 87,683,123,741 cycles # 3.072 GHz + 135,626,627,328 instructions # 1.55 insn per cycle + 28.545959109 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:15563) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627534e-04 -Avg ME (F77/C++) = 6.6275340237237357E-004 -Relative difference = 3.579572077573998e-09 +Avg ME (F77/C++) = 6.6275340277317796E-004 +Relative difference = 4.184328521943034e-09 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.661989e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.676894e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.676894e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.211992e-01 +- 3.254573e-01 ) GeV^-4 -TOTAL : 1.900144 sec - 6,560,267,417 cycles # 3.450 GHz - 19,382,075,169 instructions # 2.95 insn per cycle - 1.902414328 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.148984e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.161699e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.161699e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 +TOTAL : 2.302428 sec + 6,776,067,855 cycles # 2.939 GHz + 19,386,467,667 instructions # 2.86 insn per cycle + 2.306810458 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:69681) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274862707273868E-004 Relative difference = 4.0849182767952624e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.751056e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.757067e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.757067e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 -TOTAL : 0.943073 sec - 3,117,335,618 cycles # 3.300 GHz - 6,803,703,724 instructions # 2.18 insn per cycle - 0.945302019 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.506728e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.512574e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.512574e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 +TOTAL : 1.096393 sec + 3,175,310,502 cycles # 2.890 GHz + 6,807,675,147 instructions # 2.14 insn per cycle + 1.100557110 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:49077) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627273e-04 Avg ME (F77/C++) = 6.6272731558747466E-004 Relative difference = 2.3520194007978538e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.112187e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.120841e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.120841e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 -TOTAL : 0.782995 sec - 2,589,201,691 cycles # 3.300 GHz - 5,981,113,539 instructions # 2.31 insn per cycle - 0.785295392 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.815661e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.823746e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.823746e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 +TOTAL : 0.911313 sec + 2,641,911,907 cycles # 2.888 GHz + 5,985,989,672 instructions # 2.27 insn per cycle + 0.915610697 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:42677) (512y: 11) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627273e-04 Avg ME (F77/C++) = 6.6272731558747466E-004 Relative difference = 2.3520194007978538e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.297217e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.318515e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.318515e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.211848e-01 +- 3.254639e-01 ) GeV^-4 -TOTAL : 0.503966 sec - 1,665,820,543 cycles # 3.296 GHz - 3,493,880,654 instructions # 2.10 insn per cycle - 0.506230823 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.523255e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.528884e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.528884e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 1.084772 sec + 2,074,111,548 cycles # 1.906 GHz + 3,500,542,355 instructions # 1.69 insn per cycle + 1.089027435 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 5197) (512y: 3) (512z:44822) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627275e-04 Avg ME (F77/C++) = 6.6272750363879224E-004 Relative difference = 5.490631193034436e-09 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt index dd21065806..f668536073 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_f_inl1_hrd1.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_f_inl1_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:26:49 +DATE: 2024-03-01_03:08:48 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.541557e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.588429e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.593399e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.059596e+00 +- 2.368053e+00 ) GeV^-4 +TOTAL : 0.485011 sec + 2,123,544,393 cycles # 3.007 GHz + 3,219,525,664 instructions # 1.52 insn per cycle + 0.766064420 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 7.637487e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.696462e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.698981e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.664703e+00 +- 5.072736e+00 ) GeV^-4 +TOTAL : 1.858325 sec + 6,401,876,626 cycles # 3.056 GHz + 13,834,352,039 instructions # 2.16 insn per cycle + 2.151127842 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626454e-04 +Avg ME (F77/CUDA) = 6.6262660579844562E-004 +Relative difference = 2.836238137986709e-05 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 6.387508e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 6.388300e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 6.388300e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.204932e-01 +- 3.252405e-01 ) GeV^-4 -TOTAL : 25.683491 sec - 89,826,358,356 cycles # 3.497 GHz - 135,208,960,358 instructions # 1.51 insn per cycle - 25.685677569 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:15804) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.762616e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.763465e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.763465e+02 ) sec^-1 +MeanMatrixElemValue = ( 4.059969e+00 +- 2.367799e+00 ) GeV^-4 +TOTAL : 28.469746 sec + 87,566,965,728 cycles # 3.076 GHz + 135,909,521,186 instructions # 1.55 insn per cycle + 28.473960910 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:15910) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627535e-04 -Avg ME (F77/C++) = 6.6275352716470975E-004 -Relative difference = 4.098765184605283e-08 +Avg ME (F77/C++) = 6.6275352674967369E-004 +Relative difference = 4.0361421941458736e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.437796e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.452057e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.452057e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.211992e-01 +- 3.254573e-01 ) GeV^-4 -TOTAL : 1.950080 sec - 6,654,162,703 cycles # 3.410 GHz - 19,433,960,536 instructions # 2.92 insn per cycle - 1.952366253 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.141246e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.153468e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.153468e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.059962e+00 +- 2.367792e+00 ) GeV^-4 +TOTAL : 2.304055 sec + 6,854,008,563 cycles # 2.972 GHz + 19,438,508,034 instructions # 2.84 insn per cycle + 2.308246423 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:69723) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627486e-04 Avg ME (F77/C++) = 6.6274862764021530E-004 Relative difference = 4.170542995014107e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.786843e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.793023e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.793023e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 -TOTAL : 0.924113 sec - 3,054,845,902 cycles # 3.300 GHz - 6,714,445,014 instructions # 2.20 insn per cycle - 0.926394819 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.543089e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.548736e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.548736e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 +TOTAL : 1.070827 sec + 3,111,432,280 cycles # 2.896 GHz + 6,718,585,544 instructions # 2.16 insn per cycle + 1.075017514 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:47667) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627273e-04 Avg ME (F77/C++) = 6.6272731651051409E-004 Relative difference = 2.4912983202981302e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.132201e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.140940e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.140940e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.211846e-01 +- 3.254638e-01 ) GeV^-4 -TOTAL : 0.775532 sec - 2,564,183,121 cycles # 3.300 GHz - 5,963,951,119 instructions # 2.33 insn per cycle - 0.777744409 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.837542e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.845711e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.845711e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060903e+00 +- 2.367376e+00 ) GeV^-4 +TOTAL : 0.900474 sec + 2,630,752,588 cycles # 2.910 GHz + 5,969,340,561 instructions # 2.27 insn per cycle + 0.904647261 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:41842) (512y: 13) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627273e-04 Avg ME (F77/C++) = 6.6272731651051409E-004 Relative difference = 2.4912983202981302e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=1] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=1] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.326443e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 3.348069e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 3.348069e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.211848e-01 +- 3.254639e-01 ) GeV^-4 -TOTAL : 0.499522 sec - 1,651,245,664 cycles # 3.296 GHz - 3,487,405,377 instructions # 2.11 insn per cycle - 0.501723648 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.526039e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.531935e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.531935e+04 ) sec^-1 +MeanMatrixElemValue = ( 4.060904e+00 +- 2.367377e+00 ) GeV^-4 +TOTAL : 1.083027 sec + 2,083,719,160 cycles # 1.918 GHz + 3,494,111,175 instructions # 1.68 insn per cycle + 1.087325959 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 4161) (512y: 4) (512z:44465) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_f_inl1_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.627275e-04 Avg ME (F77/C++) = 6.6272750384530066E-004 Relative difference = 5.80223501432476e-09 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt index f9ab256fce..8553820a52 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd0.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:14:11 +DATE: 2024-03-01_02:34:44 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.473478e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.502235e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.504525e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.522907 sec + 2,248,416,129 cycles # 2.981 GHz + 3,483,881,112 instructions # 1.55 insn per cycle + 0.829467781 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.123898e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.157734e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.159130e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.035491 sec + 10,039,386,860 cycles # 3.052 GHz + 22,522,898,713 instructions # 2.24 insn per cycle + 3.349083086 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626675e-04 +Avg ME (F77/CUDA) = 6.6266732376103494E-004 +Relative difference = 2.659538381540814e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.460441e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.461666e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.461666e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 6.673146 sec - 23,335,676,255 cycles # 3.496 GHz - 79,277,751,932 instructions # 3.40 insn per cycle - 6.675453883 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4801) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.952639e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.953615e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.953615e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.409354 sec + 25,927,870,734 cycles # 3.082 GHz + 79,436,480,305 instructions # 3.06 insn per cycle + 8.416137774 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4858) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.686446e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.690711e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.690711e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.506769 sec - 11,577,360,778 cycles # 3.300 GHz - 38,543,055,165 instructions # 3.33 insn per cycle - 3.509025941 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.739028e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.742372e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.742372e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.395641 sec + 12,641,926,900 cycles # 2.873 GHz + 38,549,360,435 instructions # 3.05 insn per cycle + 4.411574958 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:13163) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.172980e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.175482e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.175482e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.405389 sec - 4,642,988,163 cycles # 3.300 GHz - 13,474,033,843 instructions # 2.90 insn per cycle - 1.407688497 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.720558e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.737987e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.737987e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.889905 sec + 5,503,418,397 cycles # 2.905 GHz + 13,481,227,468 instructions # 2.45 insn per cycle + 1.901949052 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11242) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.284064e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.287100e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.287100e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.284493 sec - 4,244,001,962 cycles # 3.300 GHz - 12,130,560,495 instructions # 2.86 insn per cycle - 1.286759541 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.817789e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.841302e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.841302e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.679659 sec + 4,858,057,374 cycles # 2.885 GHz + 12,135,455,571 instructions # 2.50 insn per cycle + 1.694768152 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10154) (512y: 79) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.007453e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.014936e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.014936e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.824364 sec - 2,723,570,767 cycles # 3.298 GHz - 6,306,786,489 instructions # 2.32 insn per cycle - 0.826694116 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.171224e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.183880e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.183880e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.297248 sec + 4,143,595,621 cycles # 1.801 GHz + 6,336,694,490 instructions # 1.53 insn per cycle + 2.312628428 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1803) (512y: 93) (512z: 9358) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 diff --git a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt index 0e2a0168d0..44d560fb63 100644 --- a/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttgg_mad/log_ggttgg_mad_m_inl0_hrd1.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg' -DATE: 2024-03-01_19:14:29 +DATE: 2024-03-01_02:35:21 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.474402e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.502829e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.505143e+05 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 0.522485 sec + 2,266,664,443 cycles # 3.011 GHz + 3,552,942,464 instructions # 1.57 insn per cycle + 0.824080628 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.147340e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.181695e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.182993e+05 ) sec^-1 +MeanMatrixElemValue = ( 6.665112e+00 +- 5.002651e+00 ) GeV^-4 +TOTAL : 3.023944 sec + 10,029,910,184 cycles # 3.059 GHz + 21,497,951,661 instructions # 2.14 insn per cycle + 3.338904131 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 6.626675e-04 +Avg ME (F77/CUDA) = 6.6266732376103494E-004 +Relative difference = 2.659538381540814e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.465973e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.467206e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.467206e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 6.658248 sec - 23,291,569,184 cycles # 3.498 GHz - 79,237,899,390 instructions # 3.40 insn per cycle - 6.660539391 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 4465) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.924823e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.925747e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.925747e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 8.531114 sec + 25,939,606,781 cycles # 3.040 GHz + 79,447,311,630 instructions # 3.06 insn per cycle + 8.537643841 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 4505) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266731406016235E-004 Relative difference = 2.8059296349552523e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.697632e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.701963e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.701963e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 3.498479 sec - 11,550,565,484 cycles # 3.300 GHz - 38,513,812,102 instructions # 3.33 insn per cycle - 3.500917728 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.758654e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.761985e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.761985e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 4.372440 sec + 12,693,692,693 cycles # 2.901 GHz + 38,521,475,204 instructions # 3.03 insn per cycle + 4.385193423 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:12930) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730246908442E-004 Relative difference = 2.98084507782618e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.152534e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.154954e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.154954e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.430198 sec - 4,725,017,333 cycles # 3.300 GHz - 13,599,869,443 instructions # 2.88 insn per cycle - 1.432524334 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.635318e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.652109e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.652109e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.908191 sec + 5,531,901,200 cycles # 2.893 GHz + 13,605,961,475 instructions # 2.46 insn per cycle + 1.920337987 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:11327) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.271343e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.274342e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.274342e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 1.297199 sec - 4,286,083,032 cycles # 3.300 GHz - 12,263,686,047 instructions # 2.86 insn per cycle - 1.299546744 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.704499e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.725961e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.725961e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 1.699452 sec + 4,910,284,170 cycles # 2.883 GHz + 12,271,024,564 instructions # 2.50 insn per cycle + 1.712563313 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:10143) (512y: 239) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe -p 64 256 1 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe -p 64 256 1 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.010301e+04 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.017802e+04 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.017802e+04 ) sec^-1 -MeanMatrixElemValue = ( 4.197467e-01 +- 3.250467e-01 ) GeV^-4 -TOTAL : 0.823170 sec - 2,718,854,468 cycles # 3.297 GHz - 6,413,166,633 instructions # 2.36 insn per cycle - 0.825525653 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.567240e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.580886e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.580886e+03 ) sec^-1 +MeanMatrixElemValue = ( 4.063123e+00 +- 2.368970e+00 ) GeV^-4 +TOTAL : 2.177959 sec + 4,164,411,217 cycles # 1.910 GHz + 6,442,301,345 instructions # 1.55 insn per cycle + 2.190574077 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1628) (512y: 191) (512z: 9356) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 6.626675e-04 Avg ME (F77/C++) = 6.6266730409276857E-004 Relative difference = 2.956342832710188e-07 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt index 3e8c727bbe..93119c7539 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_19:15:32 +DATE: 2024-03-01_02:37:42 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.065457e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.065836e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.065940e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 2.448496 sec + 8,082,390,398 cycles # 2.946 GHz + 16,852,562,382 instructions # 2.09 insn per cycle + 2.848455369 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 9.245006e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.247251e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.247453e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 4.002127 sec + 13,348,526,839 cycles # 3.088 GHz + 31,140,905,358 instructions # 2.33 insn per cycle + 4.382097820 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.872263e-03 +Avg ME (F77/CUDA) = 9.8722595284406640E-003 +Relative difference = 3.5164777671934515e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.007917e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.007943e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.007943e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.242575 sec - 18,324,214,896 cycles # 3.494 GHz - 53,656,205,350 instructions # 2.93 insn per cycle - 5.244757907 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32534) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.053587e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.053836e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.053836e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.566168 sec + 18,831,689,747 cycles # 2.868 GHz + 53,916,332,004 instructions # 2.86 insn per cycle + 6.572689464 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32448) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.907110e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.907199e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.907199e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.771308 sec - 9,690,965,867 cycles # 3.495 GHz - 27,085,067,925 instructions # 2.79 insn per cycle - 2.773516868 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.663489e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.663581e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.663581e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 3.182674 sec + 9,806,871,766 cycles # 3.081 GHz + 27,093,022,297 instructions # 2.76 insn per cycle + 3.192772007 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96443) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.253444e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.253870e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.253870e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.243280 sec - 4,107,456,413 cycles # 3.300 GHz - 9,554,437,400 instructions # 2.33 insn per cycle - 1.245464164 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.630162e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.630605e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.630605e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.462430 sec + 4,231,767,010 cycles # 2.892 GHz + 9,562,001,834 instructions # 2.26 insn per cycle + 1.472832936 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.901436e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.902024e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.902024e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.079126 sec - 3,565,937,704 cycles # 3.300 GHz - 8,478,433,637 instructions # 2.38 insn per cycle - 1.081341215 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.135973e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.136556e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.136556e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.282131 sec + 3,734,243,960 cycles # 2.905 GHz + 8,486,594,514 instructions # 2.27 insn per cycle + 1.294140643 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.570541e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.572308e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.572308e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 0.618837 sec - 2,038,410,234 cycles # 3.287 GHz - 4,264,215,045 instructions # 2.09 insn per cycle - 0.620988332 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.702281e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.702851e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.702851e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.432645 sec + 2,701,519,987 cycles # 1.882 GHz + 4,274,080,381 instructions # 1.58 insn per cycle + 1.444722496 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2284) (512y: 105) (512z:79105) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt index 898ee4858d..7163808f45 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd0_bridge.txt @@ -1,188 +1,237 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_19:31:25 +DATE: 2024-03-01_03:17:34 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.068445e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.069395e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.069395e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 2.373786 sec + 8,212,794,649 cycles # 3.050 GHz + 17,373,508,782 instructions # 2.12 insn per cycle + 2.749788140 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 9.191805e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.223957e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.223957e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 3.992060 sec + 13,207,906,873 cycles # 3.062 GHz + 30,525,969,027 instructions # 2.31 insn per cycle + 4.371813741 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.872263e-03 +Avg ME (F77/CUDA) = 9.8722595284406640E-003 +Relative difference = 3.5164777671934515e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.008044e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.008070e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.008070e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.240399 sec - 18,320,301,067 cycles # 3.495 GHz - 53,657,376,785 instructions # 2.93 insn per cycle - 5.242561648 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32534) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.148706e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.148931e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.148931e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.484661 sec + 18,737,465,302 cycles # 2.888 GHz + 53,915,906,594 instructions # 2.88 insn per cycle + 6.488680620 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32448) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.907377e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.907467e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.907467e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.770516 sec - 9,697,495,185 cycles # 3.498 GHz - 27,086,113,073 instructions # 2.79 insn per cycle - 2.772701601 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.664837e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.664944e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.664944e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 3.177972 sec + 9,794,551,146 cycles # 3.079 GHz + 27,093,049,280 instructions # 2.77 insn per cycle + 3.182112356 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96443) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.249825e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.250273e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.250273e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.244792 sec - 4,112,966,387 cycles # 3.300 GHz - 9,555,785,054 instructions # 2.32 insn per cycle - 1.246985323 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.541461e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.541883e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.541883e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.495047 sec + 4,300,282,840 cycles # 2.870 GHz + 9,561,701,370 instructions # 2.22 insn per cycle + 1.499121189 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84390) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.901276e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.901882e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.901882e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.079264 sec - 3,566,506,063 cycles # 3.300 GHz - 8,479,328,463 instructions # 2.38 insn per cycle - 1.081393203 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.118490e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.119048e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.119048e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.287264 sec + 3,730,461,014 cycles # 2.891 GHz + 8,485,603,542 instructions # 2.27 insn per cycle + 1.291227222 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79991) (512y: 91) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.552403e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.554214e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.554214e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 0.620126 sec - 2,047,589,081 cycles # 3.294 GHz - 4,264,912,626 instructions # 2.08 insn per cycle - 0.622279806 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.742786e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.743427e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.743427e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.415968 sec + 2,690,639,160 cycles # 1.896 GHz + 4,273,336,878 instructions # 1.59 insn per cycle + 1.420067464 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2284) (512y: 105) (512z:79105) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt index 6a223c1182..fcaae9673e 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_d_inl0_hrd1.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_19:15:57 +DATE: 2024-03-01_02:38:46 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.066781e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.067205e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.067339e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 2.446944 sec + 8,408,759,874 cycles # 3.068 GHz + 18,673,492,162 instructions # 2.22 insn per cycle + 2.843675081 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 9.258123e+03 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.260337e+03 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.260588e+03 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 3.986190 sec + 13,309,313,958 cycles # 3.084 GHz + 29,253,936,467 instructions # 2.20 insn per cycle + 4.370982628 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.872263e-03 +Avg ME (F77/CUDA) = 9.8722595284406640E-003 +Relative difference = 3.5164777671934515e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.978714e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.978965e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.978965e+01 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.293741 sec - 18,509,726,309 cycles # 3.496 GHz - 53,668,966,356 instructions # 2.90 insn per cycle - 5.295890020 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32178) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.505940e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.506196e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.506196e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.219195 sec + 18,809,079,145 cycles # 3.025 GHz + 53,925,834,666 instructions # 2.87 insn per cycle + 6.232860023 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32063) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.900218e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.900307e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.900307e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.781026 sec - 9,726,233,315 cycles # 3.496 GHz - 27,082,128,443 instructions # 2.78 insn per cycle - 2.783220311 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.661174e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.661266e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.661266e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 3.189478 sec + 9,805,870,159 cycles # 3.076 GHz + 27,091,831,447 instructions # 2.76 insn per cycle + 3.203897537 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96286) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285514851E-003 Relative difference = 3.5163655122073967e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.269104e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.269539e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.269539e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.238661 sec - 4,092,373,675 cycles # 3.300 GHz - 9,554,221,469 instructions # 2.33 insn per cycle - 1.240854013 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.622791e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.623217e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.623217e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.464714 sec + 4,224,699,489 cycles # 2.882 GHz + 9,562,401,622 instructions # 2.26 insn per cycle + 1.476328883 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84478) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.901596e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.902173e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.902173e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.079028 sec - 3,565,293,300 cycles # 3.300 GHz - 8,478,411,533 instructions # 2.38 insn per cycle - 1.081182403 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.104704e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.105332e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.105332e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.294499 sec + 3,723,740,700 cycles # 2.874 GHz + 8,486,051,495 instructions # 2.28 insn per cycle + 1.308410916 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:80014) (512y: 241) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.548194e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.549944e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.549944e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 0.620448 sec - 2,042,307,043 cycles # 3.284 GHz - 4,267,534,527 instructions # 2.09 insn per cycle - 0.622543731 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.737812e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.738457e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.738457e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 1.421818 sec + 2,699,411,216 cycles # 1.899 GHz + 4,277,531,970 instructions # 1.58 insn per cycle + 1.435104148 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2169) (512y: 187) (512z:79110) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595285411531E-003 Relative difference = 3.516375977906115e-07 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt index 4969158cd6..e89ab34326 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_19:16:21 +DATE: 2024-03-01_02:39:49 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 6.768224e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.769082e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.769342e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 +TOTAL : 1.706494 sec + 5,724,877,835 cycles # 2.946 GHz + 11,350,286,337 instructions # 1.98 insn per cycle + 2.064496697 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.316243e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.317022e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.317120e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 +TOTAL : 1.926202 sec + 6,794,636,243 cycles # 3.076 GHz + 13,931,883,029 instructions # 2.05 insn per cycle + 2.265774235 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.849636e-03 +Avg ME (F77/CUDA) = 9.8712405367667715E-003 +Relative difference = 0.0021934350433631634 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.040188e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.040217e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.040217e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 -TOTAL : 5.079204 sec - 17,758,838,927 cycles # 3.495 GHz - 53,421,541,162 instructions # 3.01 insn per cycle - 5.081388085 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:20403) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.967764e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.968029e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.968029e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 +TOTAL : 5.899633 sec + 18,012,008,843 cycles # 3.055 GHz + 53,588,806,253 instructions # 2.98 insn per cycle + 5.906269981 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:20208) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087545108E-003 -Relative difference = 2.11977393295785e-08 +Avg ME (F77/C++) = 9.8479612087541066E-003 +Relative difference = 2.1197698286506752e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.216869e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.217343e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.217343e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924321e-03 +- 4.918774e-03 ) GeV^-6 -TOTAL : 1.254841 sec - 4,386,128,209 cycles # 3.492 GHz - 13,755,443,722 instructions # 3.14 insn per cycle - 1.257011070 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.554445e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.554907e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.554907e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 +TOTAL : 1.492504 sec + 4,596,969,768 cycles # 3.077 GHz + 13,763,413,131 instructions # 2.99 insn per cycle + 1.508036951 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 Avg ME (F77/C++) = 9.8479546896527003E-003 Relative difference = 3.151388282563952e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.433176e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.435080e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.435080e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 -TOTAL : 0.628179 sec - 2,077,257,351 cycles # 3.300 GHz - 4,810,210,256 instructions # 2.32 insn per cycle - 0.630303522 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.129307e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.130988e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.130988e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.749250 sec + 2,146,538,234 cycles # 2.864 GHz + 4,817,770,938 instructions # 2.24 insn per cycle + 0.763621351 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161070551E-003 Relative difference = 1.858823877057982e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.775402e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.777862e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.777862e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 -TOTAL : 0.542106 sec - 1,793,648,403 cycles # 3.300 GHz - 4,267,307,526 instructions # 2.38 insn per cycle - 0.544249464 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.184924e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.187225e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.187225e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.652928 sec + 1,865,233,671 cycles # 2.849 GHz + 4,274,819,205 instructions # 2.29 insn per cycle + 0.666710238 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161070551E-003 Relative difference = 1.858823877057982e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.719714e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.720497e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.720497e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.946834e-03 +- 4.941266e-03 ) GeV^-6 -TOTAL : 0.309831 sec - 1,024,905,660 cycles # 3.295 GHz - 2,150,921,588 instructions # 2.10 insn per cycle - 0.312019958 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.469221e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.471533e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.471533e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 +TOTAL : 0.715424 sec + 1,360,172,621 cycles # 1.900 GHz + 2,159,744,323 instructions # 1.59 insn per cycle + 0.729957103 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2877) (512y: 49) (512z:79298) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892981e-03 Avg ME (F77/C++) = 9.8929811982958280E-003 Relative difference = 2.0044092642523172e-08 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt index 1862b6e1e8..684ca24c1f 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd0_bridge.txt @@ -1,188 +1,237 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_19:31:49 +DATE: 2024-03-01_03:18:37 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 2 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 6.798857e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.800593e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.800593e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187094e-05 +- 9.825664e-06 ) GeV^-6 +TOTAL : 1.598425 sec + 5,724,594,753 cycles # 3.063 GHz + 12,186,790,592 instructions # 2.13 insn per cycle + 1.928350107 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 1 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +WARNING! Set grid in Bridge (nevt=256, gpublocks=1, gputhreads=256, gpublocks*gputhreads=256) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.285950e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.298387e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.298387e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856441e-04 +- 8.331096e-05 ) GeV^-6 +TOTAL : 1.887489 sec + 6,620,617,732 cycles # 3.045 GHz + 14,303,245,528 instructions # 2.16 insn per cycle + 2.231962749 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.849636e-03 +Avg ME (F77/CUDA) = 9.8712405367667715E-003 +Relative difference = 0.0021934350433631634 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.041119e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.041148e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.041148e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 -TOTAL : 5.074721 sec - 17,746,301,498 cycles # 3.497 GHz - 53,422,248,318 instructions # 3.01 insn per cycle - 5.076826933 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:20403) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 9.094412e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 9.094687e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 9.094687e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 +TOTAL : 5.812831 sec + 17,931,583,834 cycles # 3.083 GHz + 53,588,775,363 instructions # 2.99 insn per cycle + 5.816760256 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:20208) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087545108E-003 -Relative difference = 2.11977393295785e-08 +Avg ME (F77/C++) = 9.8479612087541066E-003 +Relative difference = 2.1197698286506752e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.218426e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.218902e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.218902e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924321e-03 +- 4.918774e-03 ) GeV^-6 -TOTAL : 1.254452 sec - 4,388,322,961 cycles # 3.494 GHz - 13,756,441,763 instructions # 3.13 insn per cycle - 1.256579192 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.573130e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.573569e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.573569e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 +TOTAL : 1.483014 sec + 4,585,157,051 cycles # 3.085 GHz + 13,762,636,955 instructions # 3.00 insn per cycle + 1.487033664 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96986) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 Avg ME (F77/C++) = 9.8479546896527003E-003 Relative difference = 3.151388282563952e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.453965e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.455916e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.455916e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 -TOTAL : 0.626630 sec - 2,072,617,839 cycles # 3.300 GHz - 4,811,160,652 instructions # 2.32 insn per cycle - 0.628769204 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.234993e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.236702e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.236702e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.734407 sec + 2,124,324,714 cycles # 2.880 GHz + 4,817,114,861 instructions # 2.27 insn per cycle + 0.738469635 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84904) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161070551E-003 Relative difference = 1.858823877057982e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.777854e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.780436e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.780436e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 -TOTAL : 0.541994 sec - 1,793,603,060 cycles # 3.300 GHz - 4,268,281,709 instructions # 2.38 insn per cycle - 0.544156652 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.746826e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.748881e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.748881e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.686036 sec + 1,868,608,359 cycles # 2.710 GHz + 4,274,464,507 instructions # 2.29 insn per cycle + 0.690085324 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:80610) (512y: 46) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161070551E-003 Relative difference = 1.858823877057982e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe -p 1 256 2 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=256) -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.717939e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.718720e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.718720e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.946834e-03 +- 4.941266e-03 ) GeV^-6 -TOTAL : 0.310080 sec - 1,025,575,344 cycles # 3.292 GHz - 2,151,895,701 instructions # 2.10 insn per cycle - 0.312200294 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.587479e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.589999e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.589999e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 +TOTAL : 0.701778 sec + 1,356,865,477 cycles # 1.924 GHz + 2,159,196,207 instructions # 1.59 insn per cycle + 0.705773287 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2877) (512y: 49) (512z:79298) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892981e-03 Avg ME (F77/C++) = 9.8929811982958280E-003 Relative difference = 2.0044092642523172e-08 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt index 0d3fb1c86c..2af18ad9d5 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_f_inl0_hrd1.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_19:16:40 +DATE: 2024-03-01_02:40:36 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 6.765595e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.766455e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.766757e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.186984e-05 +- 9.824899e-06 ) GeV^-6 +TOTAL : 1.693781 sec + 5,858,518,501 cycles # 3.029 GHz + 12,487,165,720 instructions # 2.13 insn per cycle + 2.044833380 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.312075e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.312852e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.312969e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856829e-04 +- 8.333435e-05 ) GeV^-6 +TOTAL : 1.933893 sec + 6,737,061,424 cycles # 3.047 GHz + 14,801,104,127 instructions # 2.20 insn per cycle + 2.267780802 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.849636e-03 +Avg ME (F77/CUDA) = 9.8712405367667715E-003 +Relative difference = 0.0021934350433631634 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.032406e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.032434e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.032434e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924324e-03 +- 4.918778e-03 ) GeV^-6 -TOTAL : 5.116727 sec - 17,886,024,738 cycles # 3.495 GHz - 53,426,547,569 instructions # 2.99 insn per cycle - 5.118914313 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:20415) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.922433e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.922702e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.922702e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825040e-06 ) GeV^-6 +TOTAL : 5.925615 sec + 17,989,215,363 cycles # 3.036 GHz + 53,579,777,630 instructions # 2.98 insn per cycle + 5.931642569 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:20207) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847961e-03 -Avg ME (F77/C++) = 9.8479612087257751E-003 -Relative difference = 2.119482139617284e-08 +Avg ME (F77/C++) = 9.8479612087582491E-003 +Relative difference = 2.1198118933954545e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.256894e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.257370e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.257370e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.924321e-03 +- 4.918774e-03 ) GeV^-6 -TOTAL : 1.243013 sec - 4,349,759,961 cycles # 3.495 GHz - 13,748,864,713 instructions # 3.16 insn per cycle - 1.245159087 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.564689e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.565144e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.565144e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187013e-05 +- 9.825037e-06 ) GeV^-6 +TOTAL : 1.490731 sec + 4,558,556,123 cycles # 3.055 GHz + 13,757,084,226 instructions # 3.02 insn per cycle + 1.501811120 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96606) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.847955e-03 Avg ME (F77/C++) = 9.8479546896225560E-003 Relative difference = 3.151694379513441e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.499112e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.501031e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.501031e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 -TOTAL : 0.623063 sec - 2,060,898,442 cycles # 3.300 GHz - 4,812,404,156 instructions # 2.34 insn per cycle - 0.625176375 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.177084e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.178836e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.178836e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.743943 sec + 2,139,817,263 cycles # 2.875 GHz + 4,819,936,629 instructions # 2.25 insn per cycle + 0.755587883 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:85359) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161070967E-003 Relative difference = 1.8588234562202478e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.724133e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.726548e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.726548e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.946830e-03 +- 4.941261e-03 ) GeV^-6 -TOTAL : 0.544853 sec - 1,802,924,271 cycles # 3.300 GHz - 4,269,301,578 instructions # 2.37 insn per cycle - 0.547035634 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.229829e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.232369e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.232369e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826767e-06 ) GeV^-6 +TOTAL : 0.647666 sec + 1,869,906,105 cycles # 2.875 GHz + 4,276,791,956 instructions # 2.29 insn per cycle + 0.664053491 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:81075) (512y: 26) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892973e-03 Avg ME (F77/C++) = 9.8929728161070967E-003 Relative difference = 1.8588234562202478e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.697678e+03 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.698411e+03 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.698411e+03 ) sec^-1 -MeanMatrixElemValue = ( 4.946834e-03 +- 4.941266e-03 ) GeV^-6 -TOTAL : 0.313529 sec - 1,032,832,833 cycles # 3.280 GHz - 2,157,258,957 instructions # 2.09 insn per cycle - 0.315697429 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.437378e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.439646e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.439646e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187188e-05 +- 9.826771e-06 ) GeV^-6 +TOTAL : 0.718650 sec + 1,366,457,842 cycles # 1.901 GHz + 2,166,062,692 instructions # 1.59 insn per cycle + 0.731356674 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3474) (512y: 34) (512z:79492) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.892981e-03 Avg ME (F77/C++) = 9.8929811982955140E-003 Relative difference = 2.0044060904369713e-08 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt index 8c752511ca..c639834643 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd0.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_19:17:00 +DATE: 2024-03-01_02:41:23 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.691286e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.691795e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.691928e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 2.198692 sec + 7,604,134,018 cycles # 3.054 GHz + 16,321,512,266 instructions # 2.15 insn per cycle + 2.594812497 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.112457e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.112776e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.112803e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 3.397194 sec + 11,475,121,938 cycles # 3.084 GHz + 26,000,925,285 instructions # 2.27 insn per cycle + 3.777191130 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.872263e-03 +Avg ME (F77/CUDA) = 9.8722599015656498E-003 +Relative difference = 3.1385249252060663e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.888048e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.888295e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.888295e+01 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.342426 sec - 18,680,140,983 cycles # 3.496 GHz - 53,894,914,129 instructions # 2.89 insn per cycle - 5.344498912 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32196) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 8.034566e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 8.034790e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 8.034790e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.578920 sec + 19,096,747,933 cycles # 2.903 GHz + 54,154,360,803 instructions # 2.84 insn per cycle + 6.585797711 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32067) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.973916e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.974013e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.974013e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.677279 sec - 9,359,247,427 cycles # 3.494 GHz - 26,144,863,285 instructions # 2.79 insn per cycle - 2.679446649 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.634173e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.634271e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.634271e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 3.239396 sec + 9,369,032,238 cycles # 2.892 GHz + 26,160,172,444 instructions # 2.79 insn per cycle + 3.251135271 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:96007) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.494886e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.495366e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.495366e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.176795 sec - 3,888,164,225 cycles # 3.300 GHz - 9,214,766,456 instructions # 2.37 insn per cycle - 1.178930781 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.697087e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.697545e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.697545e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.438333 sec + 4,079,178,507 cycles # 2.840 GHz + 9,228,646,226 instructions # 2.26 insn per cycle + 1.450605350 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:84155) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 5.132578e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.133212e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.133212e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.030734 sec - 3,406,124,705 cycles # 3.300 GHz - 8,162,197,379 instructions # 2.40 insn per cycle - 1.032894212 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.363646e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.364393e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.364393e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.218747 sec + 3,509,445,956 cycles # 2.879 GHz + 8,176,263,750 instructions # 2.33 insn per cycle + 1.230057623 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79844) (512y: 79) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.830953e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.832825e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.832825e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 0.600646 sec - 1,980,221,274 cycles # 3.289 GHz - 4,145,631,132 instructions # 2.09 insn per cycle - 0.602788892 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.850358e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.851005e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.851005e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.381042 sec + 2,620,845,167 cycles # 1.898 GHz + 4,155,618,865 instructions # 1.59 insn per cycle + 1.395419124 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2046) (512y: 93) (512z:78760) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 diff --git a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt index 04dd6516f8..ace04f97d7 100644 --- a/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_ggttggg_mad/log_ggttggg_mad_m_inl0_hrd1.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg' -DATE: 2024-03-01_19:17:24 +DATE: 2024-03-01_02:42:25 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 2 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 4.691636e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.692217e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.692361e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 2.171682 sec + 7,616,890,265 cycles # 3.058 GHz + 16,356,089,453 instructions # 2.15 insn per cycle + 2.553555988 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 1 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GG_TTXGGG_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 1.106871e+04 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.107188e+04 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.107217e+04 ) sec^-1 +MeanMatrixElemValue = ( 1.856249e-04 +- 8.329951e-05 ) GeV^-6 +TOTAL : 3.406322 sec + 11,260,210,288 cycles # 3.017 GHz + 25,906,087,343 instructions # 2.30 insn per cycle + 3.788413520 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 9.872263e-03 +Avg ME (F77/CUDA) = 9.8722599015656498E-003 +Relative difference = 3.1385249252060663e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 9.833294e+01 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.833543e+01 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.833543e+01 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 5.373246 sec - 18,782,693,778 cycles # 3.495 GHz - 53,895,950,177 instructions # 2.87 insn per cycle - 5.375414541 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4:32348) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 7.951672e+01 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.951882e+01 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.951882e+01 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825549e-06 ) GeV^-6 +TOTAL : 6.644473 sec + 19,262,229,911 cycles # 2.898 GHz + 54,152,472,780 instructions # 2.81 insn per cycle + 6.648593616 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4:32244) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722595861831675E-003 Relative difference = 3.457988134687711e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.984004e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.984101e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.984101e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 2.663925 sec - 9,315,048,345 cycles # 3.495 GHz - 26,065,146,110 instructions # 2.80 insn per cycle - 2.666137308 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.623003e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.623092e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.623092e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 3.257928 sec + 9,349,757,536 cycles # 2.867 GHz + 26,077,919,393 instructions # 2.79 insn per cycle + 3.270643449 seconds time elapsed =Symbols in CPPProcess.o= (~sse4:95901) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594844308162E-003 Relative difference = 3.5610570575237004e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.435411e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.435873e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.435873e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.192345 sec - 3,939,476,607 cycles # 3.300 GHz - 9,201,069,306 instructions # 2.34 insn per cycle - 1.194498730 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.760154e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.760626e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.760626e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.408906 sec + 4,059,558,991 cycles # 2.874 GHz + 9,213,876,384 instructions # 2.27 insn per cycle + 1.420092908 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:83776) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 5.088546e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.089160e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.089160e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 1.039445 sec - 3,434,917,766 cycles # 3.300 GHz - 8,155,176,688 instructions # 2.37 insn per cycle - 1.041616665 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.304001e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.304638e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.304638e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.231479 sec + 3,558,951,872 cycles # 2.881 GHz + 8,168,148,330 instructions # 2.30 insn per cycle + 1.241837128 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2:79373) (512y: 229) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe -p 1 256 2 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe -p 1 256 2 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GG_TTXGGG_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.811569e+02 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.813385e+02 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.813385e+02 ) sec^-1 -MeanMatrixElemValue = ( 4.936475e-03 +- 4.930917e-03 ) GeV^-6 -TOTAL : 0.601983 sec - 1,984,653,054 cycles # 3.289 GHz - 4,144,401,294 instructions # 2.09 insn per cycle - 0.604132820 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.836982e+02 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.837574e+02 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.837574e+02 ) sec^-1 +MeanMatrixElemValue = ( 1.187066e-05 +- 9.825548e-06 ) GeV^-6 +TOTAL : 1.381601 sec + 2,619,896,392 cycles # 1.892 GHz + 4,153,497,129 instructions # 1.59 insn per cycle + 1.390536918 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1493) (512y: 175) (512z:78776) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gg_ttggg.mad/SubProcesses/P1_gg_ttxggg/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 9.872263e-03 Avg ME (F77/C++) = 9.8722594324461913E-003 Relative difference = 3.613714310412983e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt index 2cec001807..4f705cbffa 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_19:14:48 +DATE: 2024-03-01_02:35:57 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.695225e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.365990e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.743234e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.446213 sec + 1,972,017,701 cycles # 2.992 GHz + 2,778,256,208 instructions # 1.41 insn per cycle + 0.734930275 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.267244e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.134450e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.554945e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.528224 sec + 2,304,762,750 cycles # 3.008 GHz + 3,294,040,641 instructions # 1.43 insn per cycle + 0.823439197 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424749e-01 +Avg ME (F77/CUDA) = 0.14247482467490466 +Relative difference = 5.286902838873106e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.390238e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.419308e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.419308e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.204060 sec - 4,213,611,842 cycles # 3.495 GHz - 13,434,873,181 instructions # 3.19 insn per cycle - 1.206434287 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 864) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.091452e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.114280e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.114280e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.522856 sec + 4,703,604,569 cycles # 3.081 GHz + 13,462,460,024 instructions # 2.86 insn per cycle + 1.529442917 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499484 Relative difference = 5.286896509487005e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.533543e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.629343e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.629343e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.671073 sec - 2,349,549,529 cycles # 3.493 GHz - 7,542,124,614 instructions # 3.21 insn per cycle - 0.673394223 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.951069e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.025448e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.025448e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.861454 sec + 2,622,516,081 cycles # 3.029 GHz + 7,553,226,055 instructions # 2.88 insn per cycle + 0.875162721 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499478 Relative difference = 5.28689651338321e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.512664e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.794644e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.794644e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.387185 sec - 1,289,012,082 cycles # 3.315 GHz - 3,105,638,031 instructions # 2.41 insn per cycle - 0.389505293 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.378326e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.598362e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.598362e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.506903 sec + 1,479,878,074 cycles # 2.896 GHz + 3,120,545,502 instructions # 2.11 insn per cycle + 0.521612120 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.739873e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.054260e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.054260e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.370616 sec - 1,234,624,013 cycles # 3.316 GHz - 2,961,729,590 instructions # 2.40 insn per cycle - 0.372993568 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.763846e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.033394e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.033394e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.456990 sec + 1,342,026,946 cycles # 2.909 GHz + 2,982,806,139 instructions # 2.22 insn per cycle + 0.473253864 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 5.362372e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.769155e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.769155e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.330642 sec - 1,102,117,372 cycles # 3.316 GHz - 1,932,728,355 instructions # 1.75 insn per cycle - 0.333111717 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.552530e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.674072e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.674072e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.665523 sec + 1,326,336,546 cycles # 1.981 GHz + 1,954,248,677 instructions # 1.47 insn per cycle + 0.676015017 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt index bb3a843a97..7838899130 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd0_bridge.txt @@ -1,188 +1,237 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_19:30:37 +DATE: 2024-03-01_03:15:54 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.566228e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.132243e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.132243e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.471075 sec + 2,051,009,542 cycles # 3.009 GHz + 3,055,349,974 instructions # 1.49 insn per cycle + 0.738770181 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:DBL+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.288005e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.253544e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.253544e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.748132 sec + 3,046,262,026 cycles # 3.023 GHz + 4,636,082,832 instructions # 1.52 insn per cycle + 1.065675268 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424749e-01 +Avg ME (F77/CUDA) = 0.14247482467490466 +Relative difference = 5.286902838873106e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.391136e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.420435e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.420435e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.207326 sec - 4,222,941,203 cycles # 3.492 GHz - 13,442,631,396 instructions # 3.18 insn per cycle - 1.209898020 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 864) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.089966e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.112868e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.112868e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.529900 sec + 4,728,814,715 cycles # 3.083 GHz + 13,467,526,764 instructions # 2.85 insn per cycle + 1.534252544 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 860) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499484 Relative difference = 5.286896509487005e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.522907e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.617976e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.617976e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.678477 sec - 2,374,544,352 cycles # 3.491 GHz - 7,592,390,633 instructions # 3.20 insn per cycle - 0.680996171 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.949285e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.024056e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.024056e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.869004 sec + 2,652,875,861 cycles # 3.039 GHz + 7,602,145,003 instructions # 2.87 insn per cycle + 0.873736497 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3099) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499478 Relative difference = 5.28689651338321e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.487640e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.768436e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.768436e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.394135 sec - 1,312,471,214 cycles # 3.316 GHz - 3,157,137,064 instructions # 2.41 insn per cycle - 0.396629875 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.146841e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.351542e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.351542e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.550316 sec + 1,514,222,662 cycles # 2.732 GHz + 3,170,467,422 instructions # 2.09 insn per cycle + 0.554802806 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2917) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.703184e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.017690e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.017690e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.378206 sec - 1,260,283,982 cycles # 3.317 GHz - 3,011,528,674 instructions # 2.39 insn per cycle - 0.380880323 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.650572e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.918840e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.918840e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.478096 sec + 1,374,122,120 cycles # 2.850 GHz + 3,032,631,270 instructions # 2.21 insn per cycle + 0.482825268 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2694) (512y: 104) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 5.332880e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.727962e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.727962e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.336607 sec - 1,122,764,946 cycles # 3.319 GHz - 1,968,890,958 instructions # 1.75 insn per cycle - 0.338989281 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.537453e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.662993e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.662993e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.675099 sec + 1,354,490,621 cycles # 1.996 GHz + 1,991,409,834 instructions # 1.47 insn per cycle + 0.679620955 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1372) (512y: 106) (512z: 2173) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt index 72f1443440..1de3a7df55 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_d_inl0_hrd1.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_d_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_19:14:56 +DATE: 2024-03-01_02:36:15 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.634258e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.200936e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.553712e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.443315 sec + 2,012,981,464 cycles # 3.013 GHz + 2,802,025,362 instructions # 1.39 insn per cycle + 0.744859677 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:DBL+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = DOUBLE (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.239420e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.026633e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.428795e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.526694 sec + 2,300,725,267 cycles # 3.007 GHz + 3,244,738,845 instructions # 1.41 insn per cycle + 0.822736768 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424749e-01 +Avg ME (F77/CUDA) = 0.14247482467490466 +Relative difference = 5.286902838873106e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.382004e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.410796e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.410796e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.210873 sec - 4,235,605,729 cycles # 3.494 GHz - 13,439,420,817 instructions # 3.17 insn per cycle - 1.213116092 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 853) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.093034e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.115683e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.115683e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.520645 sec + 4,710,102,553 cycles # 3.090 GHz + 13,456,334,828 instructions # 2.86 insn per cycle + 1.527404362 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 849) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499484 Relative difference = 5.286896509487005e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.515723e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.608916e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.608916e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.675572 sec - 2,364,775,142 cycles # 3.493 GHz - 7,541,520,259 instructions # 3.19 insn per cycle - 0.677865882 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.995699e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.070809e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.070809e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.841713 sec + 2,618,818,041 cycles # 3.096 GHz + 7,552,217,415 instructions # 2.88 insn per cycle + 0.854217946 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3092) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467499478 Relative difference = 5.28689651338321e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.519149e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.803981e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.803981e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.386496 sec - 1,286,321,868 cycles # 3.315 GHz - 3,104,502,696 instructions # 2.41 insn per cycle - 0.388829317 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.378534e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.594400e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.594400e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.506766 sec + 1,482,977,233 cycles # 2.909 GHz + 3,119,381,568 instructions # 2.10 insn per cycle + 0.519705447 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2900) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.751909e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.067933e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.067933e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.369266 sec - 1,230,024,804 cycles # 3.316 GHz - 2,957,574,250 instructions # 2.40 insn per cycle - 0.371630014 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.757237e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.033602e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.033602e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.457488 sec + 1,337,095,985 cycles # 2.896 GHz + 2,979,946,273 instructions # 2.23 insn per cycle + 0.473330982 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2670) (512y: 104) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:DBL+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:DBL+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = DOUBLE (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 5.342815e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.741091e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.741091e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.331348 sec - 1,101,123,467 cycles # 3.308 GHz - 1,929,122,407 instructions # 1.75 insn per cycle - 0.333682550 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.547680e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.672650e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.672650e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.666550 sec + 1,326,556,264 cycles # 1.978 GHz + 1,952,513,162 instructions # 1.47 insn per cycle + 0.681133765 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1348) (512y: 106) (512z: 2173) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_d_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482467492595 Relative difference = 5.286901344678233e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt index 1c0f8553bf..4d40239a82 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_19:15:03 +DATE: 2024-03-01_02:36:34 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.367019e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.211392e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.351303e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 +TOTAL : 0.439896 sec + 1,919,384,660 cycles # 2.928 GHz + 2,652,462,812 instructions # 1.38 insn per cycle + 0.728915663 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 7.249516e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.812359e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.959123e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 +TOTAL : 0.476459 sec + 2,111,535,021 cycles # 3.010 GHz + 2,984,192,787 instructions # 1.41 insn per cycle + 0.759063881 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424226e-01 +Avg ME (F77/CUDA) = 0.14247488790821983 +Relative difference = 0.00036713209996037764 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.488352e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.522956e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.522956e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945528e+02 +- 1.186199e+02 ) GeV^-2 -TOTAL : 1.124842 sec - 3,936,915,161 cycles # 3.495 GHz - 13,032,328,324 instructions # 3.31 insn per cycle - 1.127095177 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 748) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.158503e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.184413e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.184413e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 1.434431 sec + 4,452,862,887 cycles # 3.097 GHz + 13,047,773,125 instructions # 2.93 insn per cycle + 1.440725517 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246857540270419 Relative difference = 1.7265064590569047e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.897446e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.147364e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.147364e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945527e+02 +- 1.186198e+02 ) GeV^-2 -TOTAL : 0.443424 sec - 1,554,112,621 cycles # 3.493 GHz - 4,506,739,300 instructions # 2.90 insn per cycle - 0.445630602 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.101216e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.298192e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.298192e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 0.547840 sec + 1,698,684,785 cycles # 3.077 GHz + 4,513,142,797 instructions # 2.66 insn per cycle + 0.560862800 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3600) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246859631675157 Relative difference = 2.5853054135974944e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.891149e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.978192e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.978192e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.947128e+02 +- 1.186880e+02 ) GeV^-2 -TOTAL : 0.230449 sec - 770,038,251 cycles # 3.320 GHz - 1,884,223,687 instructions # 2.45 insn per cycle - 0.232708341 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.089458e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.856206e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.856206e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.289099 sec + 853,788,001 cycles # 2.912 GHz + 1,897,231,072 instructions # 2.22 insn per cycle + 0.300313484 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247489318272599 Relative difference = 4.784894739577799e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.376104e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.503945e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.503945e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.947128e+02 +- 1.186880e+02 ) GeV^-2 -TOTAL : 0.218942 sec - 731,762,064 cycles # 3.320 GHz - 1,799,410,295 instructions # 2.46 insn per cycle - 0.221195439 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.510175e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.400201e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.400201e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.271830 sec + 801,479,133 cycles # 2.904 GHz + 1,820,357,988 instructions # 2.27 insn per cycle + 0.285846070 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247489318272599 Relative difference = 4.784894739577799e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.006759e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.177946e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.177946e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.947128e+02 +- 1.186880e+02 ) GeV^-2 -TOTAL : 0.186117 sec - 620,244,286 cycles # 3.306 GHz - 1,284,140,386 instructions # 2.07 insn per cycle - 0.188364686 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.997156e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.506085e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.506085e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.349567 sec + 731,841,700 cycles # 2.069 GHz + 1,305,336,291 instructions # 1.78 insn per cycle + 0.359850888 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1968) (512y: 32) (512z: 2383) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247489383243206 Relative difference = 4.32888033512879e-08 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt index b407e68a76..441da29ffb 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd0_bridge.txt @@ -1,188 +1,237 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_19:30:44 +DATE: 2024-03-01_03:16:12 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 10 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.711602e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.109045e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.109045e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.017654e+01 +- 1.429184e+01 ) GeV^-2 +TOTAL : 0.448633 sec + 2,014,530,108 cycles # 3.024 GHz + 2,953,646,670 instructions # 1.47 insn per cycle + 0.724573840 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 64 256 1 --bridge +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +WARNING! Set grid in Bridge (nevt=16384, gpublocks=64, gputhreads=256, gpublocks*gputhreads=16384) +==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe -p 2048 256 1 --bridge OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +WARNING! Bridge selected: cannot use RamboDevice, will use RamboHost +WARNING! RamboHost selected: cannot use CurandDevice, will use CurandHost +WARNING! Instantiate device Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +WARNING! Set grid in Bridge (nevt=524288, gpublocks=2048, gputhreads=256, gpublocks*gputhreads=524288) +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:FLT+THX:CURHST+RMBHST+BRDDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.194631e+06 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.629307e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.629307e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.609942e+02 +- 2.115590e+02 ) GeV^-2 +TOTAL : 0.616658 sec + 2,563,348,424 cycles # 3.027 GHz + 3,871,269,369 instructions # 1.51 insn per cycle + 0.904047137 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424226e-01 +Avg ME (F77/CUDA) = 0.14247488790821983 +Relative difference = 0.00036713209996037764 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.484109e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.518859e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.518859e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945528e+02 +- 1.186199e+02 ) GeV^-2 -TOTAL : 1.130053 sec - 3,951,393,993 cycles # 3.492 GHz - 13,036,916,705 instructions # 3.30 insn per cycle - 1.132358928 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 748) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.161555e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.188116e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.188116e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 1.433803 sec + 4,469,694,345 cycles # 3.110 GHz + 13,052,094,019 instructions # 2.92 insn per cycle + 1.437926738 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 745) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246857540270419 Relative difference = 1.7265064590569047e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/sse4+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.884653e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.132509e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.132509e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945527e+02 +- 1.186198e+02 ) GeV^-2 -TOTAL : 0.447778 sec - 1,569,690,605 cycles # 3.494 GHz - 4,555,017,000 instructions # 2.90 insn per cycle - 0.450120274 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.090515e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.286507e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.286507e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 0.554057 sec + 1,716,801,013 cycles # 3.079 GHz + 4,560,314,564 instructions # 2.66 insn per cycle + 0.558193661 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3600) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246859631675157 Relative difference = 2.5853054135974944e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/avx2+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.847528e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.908555e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.908555e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.947128e+02 +- 1.186880e+02 ) GeV^-2 -TOTAL : 0.234405 sec - 783,805,869 cycles # 3.322 GHz - 1,921,295,110 instructions # 2.45 insn per cycle - 0.236707878 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 5.984424e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.738205e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.738205e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.297621 sec + 872,015,724 cycles # 2.894 GHz + 1,933,356,220 instructions # 2.22 insn per cycle + 0.301984624 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3491) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247489318272599 Relative difference = 4.784894739577799e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512y+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.304436e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.414872e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.414872e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.947128e+02 +- 1.186880e+02 ) GeV^-2 -TOTAL : 0.223426 sec - 747,350,874 cycles # 3.322 GHz - 1,836,566,153 instructions # 2.46 insn per cycle - 0.225745389 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.471182e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.343667e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.343667e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.276934 sec + 818,470,682 cycles # 2.917 GHz + 1,856,220,484 instructions # 2.27 insn per cycle + 0.281151541 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3335) (512y: 22) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247489318272599 Relative difference = 4.784894739577799e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe -p 64 256 10 --bridge OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions WARNING! Instantiate host Bridge (nevt=16384) -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+BRDHST/512z+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+BRDHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.000891e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.167702e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.167702e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.947128e+02 +- 1.186880e+02 ) GeV^-2 -TOTAL : 0.190037 sec - 635,927,755 cycles # 3.319 GHz - 1,325,913,533 instructions # 2.09 insn per cycle - 0.192251442 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.926101e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.412906e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.412906e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.358667 sec + 751,185,964 cycles # 2.073 GHz + 1,346,032,296 instructions # 1.79 insn per cycle + 0.362975431 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1968) (512y: 32) (512z: 2383) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247489383243206 Relative difference = 4.32888033512879e-08 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt index 5b9052bbb4..8918bec5c8 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_f_inl0_hrd1.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_f_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_19:15:10 +DATE: 2024-03-01_02:36:50 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 5.307953e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.201255e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.336658e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.018174e+01 +- 1.429492e+01 ) GeV^-2 +TOTAL : 0.436130 sec + 1,959,442,257 cycles # 3.009 GHz + 2,743,667,126 instructions # 1.40 insn per cycle + 0.720037686 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 167 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:FLT+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = FLOAT (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 7.165076e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.782519e+08 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.922757e+08 ) sec^-1 +MeanMatrixElemValue = ( 2.571361e+02 +- 2.114021e+02 ) GeV^-2 +TOTAL : 0.476114 sec + 2,116,952,174 cycles # 3.025 GHz + 3,000,364,507 instructions # 1.42 insn per cycle + 0.758577490 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424226e-01 +Avg ME (F77/CUDA) = 0.14247488790821983 +Relative difference = 0.00036713209996037764 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.490551e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.525624e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.525624e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945528e+02 +- 1.186199e+02 ) GeV^-2 -TOTAL : 1.123253 sec - 3,928,398,999 cycles # 3.493 GHz - 13,013,265,295 instructions # 3.31 insn per cycle - 1.125553354 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 732) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.155211e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.181167e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.181167e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429903e+01 ) GeV^-2 +TOTAL : 1.438010 sec + 4,446,707,539 cycles # 3.084 GHz + 13,028,651,848 instructions # 2.93 insn per cycle + 1.444314220 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 727) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246857540270419 Relative difference = 1.7265064590569047e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 3.877785e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.124762e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.124762e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.945527e+02 +- 1.186198e+02 ) GeV^-2 -TOTAL : 0.445530 sec - 1,559,423,077 cycles # 3.489 GHz - 4,502,604,517 instructions # 2.89 insn per cycle - 0.447755001 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.098425e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.294299e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.294299e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018563e+01 +- 1.429902e+01 ) GeV^-2 +TOTAL : 0.547784 sec + 1,696,823,876 cycles # 3.074 GHz + 4,509,092,353 instructions # 2.66 insn per cycle + 0.559046282 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3588) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424686e-01 Avg ME (F77/C++) = 0.14246859631675157 Relative difference = 2.5853054135974944e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 7.903710e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 8.988700e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 8.988700e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.947128e+02 +- 1.186880e+02 ) GeV^-2 -TOTAL : 0.230083 sec - 768,596,123 cycles # 3.320 GHz - 1,881,278,062 instructions # 2.45 insn per cycle - 0.232327253 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.019219e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.763141e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 6.763141e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.292180 sec + 859,590,330 cycles # 2.901 GHz + 1,893,994,453 instructions # 2.20 insn per cycle + 0.304986924 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3461) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247489318272599 Relative difference = 4.784894739577799e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 8.360577e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 9.493697e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 9.493697e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.947128e+02 +- 1.186880e+02 ) GeV^-2 -TOTAL : 0.219203 sec - 732,515,894 cycles # 3.320 GHz - 1,795,625,953 instructions # 2.45 insn per cycle - 0.221411578 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 6.549494e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.438482e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.438482e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018828e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.269638 sec + 798,515,936 cycles # 2.915 GHz + 1,816,168,831 instructions # 2.27 insn per cycle + 0.281600896 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3298) (512y: 22) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247489318272599 Relative difference = 4.784894739577799e-08 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:FLT+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:FLT+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = FLOAT (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[16] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.010951e+06 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.181673e+06 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.181673e+06 ) sec^-1 -MeanMatrixElemValue = ( 1.947128e+02 +- 1.186880e+02 ) GeV^-2 -TOTAL : 0.185361 sec - 618,797,353 cycles # 3.311 GHz - 1,281,956,446 instructions # 2.07 insn per cycle - 0.187631017 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 4.914139e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.405725e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.405725e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018829e+01 +- 1.429922e+01 ) GeV^-2 +TOTAL : 0.355005 sec + 734,840,966 cycles # 2.046 GHz + 1,303,017,912 instructions # 1.77 insn per cycle + 0.365594980 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1931) (512y: 32) (512z: 2383) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_f_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247489383243206 Relative difference = 4.32888033512879e-08 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt index 0e5cce6fad..9473075c44 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd0.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd0' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_19:15:17 +DATE: 2024-03-01_02:37:07 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.657865e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.342545e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.715127e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.444064 sec + 2,011,501,510 cycles # 2.996 GHz + 2,813,725,950 instructions # 1.40 insn per cycle + 0.745188123 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.264913e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 7.129230e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.558122e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.531362 sec + 2,289,898,203 cycles # 2.976 GHz + 3,193,334,828 instructions # 1.39 insn per cycle + 0.827090728 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424749e-01 +Avg ME (F77/CUDA) = 0.14247482577104625 +Relative difference = 5.209967070245855e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.380422e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.409222e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.409222e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.212271 sec - 4,238,626,091 cycles # 3.492 GHz - 13,408,625,650 instructions # 3.16 insn per cycle - 1.214366196 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 836) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.087550e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.110443e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.110443e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.528426 sec + 4,733,772,591 cycles # 3.090 GHz + 13,465,129,433 instructions # 2.84 insn per cycle + 1.534888113 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 840) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482734618697 Relative difference = 5.099411406595165e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.553855e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.651544e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.651544e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.666771 sec - 2,329,497,090 cycles # 3.485 GHz - 7,376,535,398 instructions # 3.17 insn per cycle - 0.669166331 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.994397e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.071792e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.071792e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.843067 sec + 2,603,799,246 cycles # 3.073 GHz + 7,385,481,301 instructions # 2.84 insn per cycle + 0.853727039 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3073) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482734618697 Relative difference = 5.099411406595165e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.590474e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.884942e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.884942e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.381079 sec - 1,268,726,163 cycles # 3.316 GHz - 3,041,731,155 instructions # 2.40 insn per cycle - 0.383435781 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.410870e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.639370e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.639370e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.502006 sec + 1,465,753,503 cycles # 2.896 GHz + 3,056,435,528 instructions # 2.09 insn per cycle + 0.511483566 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 3013) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482643254802 Relative difference = 5.163537715318965e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.865460e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.199799e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.199799e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.361363 sec - 1,203,719,519 cycles # 3.317 GHz - 2,908,941,396 instructions # 2.42 insn per cycle - 0.363683085 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.873726e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.164501e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.164501e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.444397 sec + 1,302,869,174 cycles # 2.905 GHz + 2,931,108,724 instructions # 2.25 insn per cycle + 0.456529729 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2799) (512y: 110) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482643254802 Relative difference = 5.163537715318965e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=0] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=0] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 5.111885e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.475216e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.475216e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.345104 sec - 1,149,300,201 cycles # 3.315 GHz - 1,946,685,540 instructions # 1.69 insn per cycle - 0.347411360 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.488835e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.605728e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.605728e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.681918 sec + 1,362,782,748 cycles # 1.986 GHz + 1,970,355,079 instructions # 1.45 insn per cycle + 0.693685126 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1700) (512y: 114) (512z: 2171) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd0/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482643254802 Relative difference = 5.163537715318965e-07 diff --git a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt index b85d55e45a..f04f8628ac 100644 --- a/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt +++ b/epochX/cudacpp/tput/logs_gqttq_mad/log_gqttq_mad_m_inl0_hrd1.txt @@ -1,183 +1,220 @@ export CUDACPP_RUNTIME_ENABLEFPE=on -Building in /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux +Building in /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux OMPFLAGS=-fopenmp AVX=512y FPTYPE=d HELINL=0 HRDCOD=0 -HASCURAND=hasNoCurand +HASCURAND=hasCurand HASHIPRAND=hasNoHiprand -Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasNoCurand_hasNoHiprand (USEBUILDDIR is set = 1) +Building in BUILDDIR=build.512y_d_inl0_hrd0 for tag=512y_d_inl0_hrd0_hasCurand_hasNoHiprand (USEBUILDDIR is set = 1) make: Nothing to be done for 'gtestlibs'. CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make USEBUILDDIR=1 AVX=none -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.none_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=sse4 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.sse4_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=avx2 -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.avx2_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512y -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512y_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' make USEBUILDDIR=1 AVX=512z -make[1]: Entering directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Entering directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' CUDACPP_BUILDDIR='build.512z_m_inl0_hrd1' make[1]: Nothing to be done for 'all'. -make[1]: Leaving directory '/data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' +make[1]: Leaving directory '/data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux' -DATE: 2024-03-01_19:15:25 +DATE: 2024-03-01_02:37:24 -On itgold91.cern.ch [CPU: Intel(R) Xeon(R) Gold 6326 CPU] [GPU: none]: +On itscrd90.cern.ch [CPU: Intel(R) Xeon(R) Silver 4216 CPU] [GPU: 1x Tesla V100S-PCIE-32GB]: ========================================================================= -Not found: /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 10 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 2.658641e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 5.216275e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 5.578681e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.445224 sec + 1,992,469,002 cycles # 2.992 GHz + 2,813,148,728 instructions # 1.41 insn per cycle + 0.736789901 seconds time elapsed +runNcu /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 64 256 1 +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +==PROF== Profiling "sigmaKin": launch__registers_per_thread 255 +==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 100% +......................................................................... +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe -p 2048 256 1 OMP= +WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions +Process = SIGMA_SM_GUX_TTXUX_CUDA [nvcc 12.0.140 (gcc 11.3.1)] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CUD:MIX+THX:CURDEV+RMBDEV+MESDEV/none+NAVBRK +FP precision = MIXED (NaN/abnormal=0, zero=0) +EvtsPerSec[Rmb+ME] (23) = ( 3.263173e+07 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 6.989199e+07 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 7.385950e+07 ) sec^-1 +MeanMatrixElemValue = ( 2.602505e+02 +- 2.116328e+02 ) GeV^-2 +TOTAL : 0.532147 sec + 2,297,521,664 cycles # 2.990 GHz + 3,210,517,070 instructions # 1.40 insn per cycle + 0.827894226 seconds time elapsed +------------------------------------------------------------------------- +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/gcheck.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fgcheck.exe 2 64 2 +Avg ME (C++/CUDA) = 1.424749e-01 +Avg ME (F77/CUDA) = 0.14247482577104625 +Relative difference = 5.209967070245855e-07 +OK (relative difference <= 5E-3) ========================================================================= -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/none+NAVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/none+NAVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = SCALAR ('none': ~vector[1], no SIMD) -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 1.377531e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 1.405985e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 1.405985e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 1.214658 sec - 4,246,718,650 cycles # 3.492 GHz - 13,407,960,809 instructions # 3.16 insn per cycle - 1.216936825 seconds time elapsed -=Symbols in CPPProcess.o= (~sse4: 826) (avx2: 0) (512y: 0) (512z: 0) +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 1.091329e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 1.113996e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 1.113996e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 1.523445 sec + 4,724,741,346 cycles # 3.094 GHz + 13,451,257,746 instructions # 2.85 insn per cycle + 1.529633779 seconds time elapsed +=Symbols in CPPProcess.o= (~sse4: 827) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.none_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482734618697 Relative difference = 5.099411406595165e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/sse4+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/sse4+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[2] ('sse4': SSE4.2, 128bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 2.553314e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 2.650058e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 2.650058e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.665859 sec - 2,332,786,191 cycles # 3.495 GHz - 7,378,270,783 instructions # 3.16 insn per cycle - 0.668166884 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.010329e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.087455e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.087455e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.835617 sec + 2,595,186,002 cycles # 3.089 GHz + 7,389,201,553 instructions # 2.85 insn per cycle + 0.854907608 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 3062) (avx2: 0) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.sse4_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482734618697 Relative difference = 5.099411406595165e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/avx2+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/avx2+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('avx2': AVX2, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.588001e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 4.879616e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 4.879616e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.381270 sec - 1,269,081,565 cycles # 3.315 GHz - 3,041,458,855 instructions # 2.40 insn per cycle - 0.383572552 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.399802e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 3.624427e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 3.624427e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.503119 sec + 1,466,604,979 cycles # 2.890 GHz + 3,056,260,975 instructions # 2.08 insn per cycle + 0.515296062 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2990) (512y: 0) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.avx2_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482643254802 Relative difference = 5.163537715318965e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512y+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512y+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[4] ('512y': AVX512, 256bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 4.863319e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.195493e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.195493e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.361431 sec - 1,203,784,149 cycles # 3.317 GHz - 2,909,476,469 instructions # 2.42 insn per cycle - 0.363729643 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 3.762321e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 4.040429e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 4.040429e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.457389 sec + 1,310,592,019 cycles # 2.838 GHz + 2,931,897,706 instructions # 2.24 insn per cycle + 0.469608344 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 2775) (512y: 110) (512z: 0) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512y_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482643254802 Relative difference = 5.163537715318965e-07 OK (relative difference <= 5E-3) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe -p 64 256 10 OMP= WARNING! CUDACPP_RUNTIME_ENABLEFPE is set: enable Floating Point Exceptions -Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.4.1] [inlineHel=0] [hardcodePARAM=1] -Workflow summary = CPP:MIX+CXS:COMMON+RMBHST+MESHST/512z+CXVBRK +Process = SIGMA_SM_GUX_TTXUX_CPP [gcc 11.3.1] [inlineHel=0] [hardcodePARAM=1] +Workflow summary = CPP:MIX+CXS:CURHST+RMBHST+MESHST/512z+CXVBRK FP precision = MIXED (NaN/abnormal=0, zero=0) Internal loops fptype_sv = VECTOR[8] ('512z': AVX512, 512bit) [cxtype_ref=YES] -OMP threads / `nproc --all` = 1 / 32 -EvtsPerSec[Rmb+ME] (23) = ( 5.102649e+05 ) sec^-1 -EvtsPerSec[MatrixElems] (3) = ( 5.463171e+05 ) sec^-1 -EvtsPerSec[MECalcOnly] (3a) = ( 5.463171e+05 ) sec^-1 -MeanMatrixElemValue = ( 1.914935e+02 +- 1.163297e+02 ) GeV^-2 -TOTAL : 0.345690 sec - 1,150,403,593 cycles # 3.313 GHz - 1,946,604,480 instructions # 1.69 insn per cycle - 0.347985872 seconds time elapsed +OMP threads / `nproc --all` = 1 / 4 +EvtsPerSec[Rmb+ME] (23) = ( 2.462138e+05 ) sec^-1 +EvtsPerSec[MatrixElems] (3) = ( 2.577756e+05 ) sec^-1 +EvtsPerSec[MECalcOnly] (3a) = ( 2.577756e+05 ) sec^-1 +MeanMatrixElemValue = ( 2.018083e+01 +- 1.429474e+01 ) GeV^-2 +TOTAL : 0.689340 sec + 1,364,202,689 cycles # 1.967 GHz + 1,970,285,028 instructions # 1.44 insn per cycle + 0.699058633 seconds time elapsed =Symbols in CPPProcess.o= (~sse4: 0) (avx2: 1676) (512y: 114) (512z: 2171) ------------------------------------------------------------------------- -runExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest.exe -[ PASSED ] 3 tests. +runExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/runTest.exe +[ PASSED ] 6 tests. ------------------------------------------------------------------------- -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 -cmpExe /data/avalassi/GPU2024/madgraph4gpuX/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/check.exe --common -p 2 64 2 +cmpExe /data/avalassi/GPU2023/madgraph4gpuBis/epochX/cudacpp/gq_ttq.mad/SubProcesses/P1_gux_ttxux/build.512z_m_inl0_hrd1/fcheck.exe 2 64 2 Avg ME (C++/C++) = 1.424749e-01 Avg ME (F77/C++) = 0.14247482643254802 Relative difference = 5.163537715318965e-07